Update SIMDUTF (#4078)

author: Jarred Sumner <jarred@jarredsumner.com> 2023-08-09 09:14:51 -0700
committer: GitHub <noreply@github.com> 2023-08-09 09:14:51 -0700
commit: b3019270c9640a60f7a30f172cea10e310baf3b6 (patch)
tree: 8b6252ac910863f513a27444526b461c39be8e76
parent: 5d7c77aab0761e16ef163dcf9792e8947bdab214 (diff)
download: bun-b3019270c9640a60f7a30f172cea10e310baf3b6.tar.gz
bun-b3019270c9640a60f7a30f172cea10e310baf3b6.tar.zst
bun-b3019270c9640a60f7a30f172cea10e310baf3b6.zip
2 files changed, 24092 insertions, 25766 deletions
diff --git a/src/bun.js/bindings/simdutf.cpp b/src/bun.js/bindings/simdutf.cpp
index 6d20bcf5e..be69372f0 100644
--- a/src/bun.js/bindings/simdutf.cpp
+++ b/src/bun.js/bindings/simdutf.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+/* auto-generated on 2023-08-08 16:23:39 -0400. Do not edit! */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf.cpp
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
@@ -11,16 +11,15 @@
 namespace simdutf {
 namespace {
 
-template<typename T>
-std::string toBinaryString(T b)
-{
-    std::string binary = "";
-    T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
-    while (mask > 0) {
-        binary += ((b & mask) == 0) ? '0' : '1';
-        mask >>= 1;
-    }
-    return binary;
+template <typename T>
+std::string toBinaryString(T b) {
+  // Debug helper: renders every bit of b, most significant bit first.
+  // Indexing bits (instead of shifting a T-typed mask) also works for signed T.
+  std::string binary = "";
+  for (size_t bit = sizeof(T) * CHAR_BIT; bit-- > 0;) {
+    binary += (((b >> bit) & T(1)) == 0) ? '0' : '1';
+  }
+  return binary;
 }
 }
 }
@@ -36,10 +35,13 @@ std::string toBinaryString(T b)
 #error "arm64.h must be included before fallback.h"
 #endif
 
+
 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64&& SIMDUTF_IS_ARM64
+#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
+
+
 
 #if SIMDUTF_IMPLEMENTATION_ARM64
 
@@ -56,6 +58,7 @@ namespace arm64 {
 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
 #define SIMDUTF_ARM64_IMPLEMENTATION_H
 
+
 namespace simdutf {
 namespace arm64 {
 
@@ -65,85 +68,83 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
+  simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32( size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
+
 };
 
 } // namespace arm64
@@ -164,6 +165,7 @@ public:
 #ifndef SIMDUTF_ARM64_INTRINSICS_H
 #define SIMDUTF_ARM64_INTRINSICS_H
 
+
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <arm_neon.h>
@@ -180,9 +182,8 @@ namespace arm64 {
 namespace {
 
 /* result might be undefined when input_num is zero */
-simdutf_really_inline int count_ones(uint64_t input_num)
-{
-    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
+simdutf_really_inline int count_ones(uint64_t input_num) {
+   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 } // unnamed namespace
@@ -198,6 +199,7 @@ simdutf_really_inline int count_ones(uint64_t input_num)
 
 #include <type_traits>
 
+
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -207,6 +209,7 @@ namespace simd {
 namespace {
 // Start of private section with Visual Studio workaround
 
+
 /**
  * make_uint8x16_t initializes a SIMD register (uint8x16_t).
  * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
@@ -218,138 +221,130 @@ namespace {
  * You should not use this function except for compile-time constants:
  * it is not efficient.
  */
-simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
-    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
-    uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
-    uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16)
-{
-    // Doing a load like so end ups generating worse code.
-    // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-    //                     x9, x10,x11,x12,x13,x14,x15,x16};
-    // return vld1q_u8(array);
-    uint8x16_t x {};
-    // incredibly, Visual Studio does not allow x[0] = x1
-    x = vsetq_lane_u8(x1, x, 0);
-    x = vsetq_lane_u8(x2, x, 1);
-    x = vsetq_lane_u8(x3, x, 2);
-    x = vsetq_lane_u8(x4, x, 3);
-    x = vsetq_lane_u8(x5, x, 4);
-    x = vsetq_lane_u8(x6, x, 5);
-    x = vsetq_lane_u8(x7, x, 6);
-    x = vsetq_lane_u8(x8, x, 7);
-    x = vsetq_lane_u8(x9, x, 8);
-    x = vsetq_lane_u8(x10, x, 9);
-    x = vsetq_lane_u8(x11, x, 10);
-    x = vsetq_lane_u8(x12, x, 11);
-    x = vsetq_lane_u8(x13, x, 12);
-    x = vsetq_lane_u8(x14, x, 13);
-    x = vsetq_lane_u8(x15, x, 14);
-    x = vsetq_lane_u8(x16, x, 15);
-    return x;
+simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
 }
 
 // We have to do the same work for make_int8x16_t
-simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
-    int8_t x5, int8_t x6, int8_t x7, int8_t x8,
-    int8_t x9, int8_t x10, int8_t x11, int8_t x12,
-    int8_t x13, int8_t x14, int8_t x15, int8_t x16)
-{
-    // Doing a load like so end ups generating worse code.
-    // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-    //                     x9, x10,x11,x12,x13,x14,x15,x16};
-    // return vld1q_s8(array);
-    int8x16_t x {};
-    // incredibly, Visual Studio does not allow x[0] = x1
-    x = vsetq_lane_s8(x1, x, 0);
-    x = vsetq_lane_s8(x2, x, 1);
-    x = vsetq_lane_s8(x3, x, 2);
-    x = vsetq_lane_s8(x4, x, 3);
-    x = vsetq_lane_s8(x5, x, 4);
-    x = vsetq_lane_s8(x6, x, 5);
-    x = vsetq_lane_s8(x7, x, 6);
-    x = vsetq_lane_s8(x8, x, 7);
-    x = vsetq_lane_s8(x9, x, 8);
-    x = vsetq_lane_s8(x10, x, 9);
-    x = vsetq_lane_s8(x11, x, 10);
-    x = vsetq_lane_s8(x12, x, 11);
-    x = vsetq_lane_s8(x13, x, 12);
-    x = vsetq_lane_s8(x14, x, 13);
-    x = vsetq_lane_s8(x15, x, 14);
-    x = vsetq_lane_s8(x16, x, 15);
-    return x;
-}
-
-simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
-    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8)
-{
-    uint8x8_t x {};
-    x = vset_lane_u8(x1, x, 0);
-    x = vset_lane_u8(x2, x, 1);
-    x = vset_lane_u8(x3, x, 2);
-    x = vset_lane_u8(x4, x, 3);
-    x = vset_lane_u8(x5, x, 4);
-    x = vset_lane_u8(x6, x, 5);
-    x = vset_lane_u8(x7, x, 6);
-    x = vset_lane_u8(x8, x, 7);
-    return x;
-}
-
-simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4,
-    uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8)
-{
-    uint16x8_t x {};
-    x = vsetq_lane_u16(x1, x, 0);
-    x = vsetq_lane_u16(x2, x, 1);
-    x = vsetq_lane_u16(x3, x, 2);
-    x = vsetq_lane_u16(x4, x, 3);
-    x = vsetq_lane_u16(x5, x, 4);
-    x = vsetq_lane_u16(x6, x, 5);
-    x = vsetq_lane_u16(x7, x, 6);
-    x = vsetq_lane_u16(x8, x, 7);
-    ;
-    return x;
-}
-
-simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4,
-    int16_t x5, int16_t x6, int16_t x7, int16_t x8)
-{
-    uint16x8_t x {};
-    x = vsetq_lane_s16(x1, x, 0);
-    x = vsetq_lane_s16(x2, x, 1);
-    x = vsetq_lane_s16(x3, x, 2);
-    x = vsetq_lane_s16(x4, x, 3);
-    x = vsetq_lane_s16(x5, x, 4);
-    x = vsetq_lane_s16(x6, x, 5);
-    x = vsetq_lane_s16(x7, x, 6);
-    x = vsetq_lane_s16(x8, x, 7);
-    ;
-    return x;
+simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8) {
+  uint8x8_t x{};
+  x = vset_lane_u8(x1, x, 0);
+  x = vset_lane_u8(x2, x, 1);
+  x = vset_lane_u8(x3, x, 2);
+  x = vset_lane_u8(x4, x, 3);
+  x = vset_lane_u8(x5, x, 4);
+  x = vset_lane_u8(x6, x, 5);
+  x = vset_lane_u8(x7, x, 6);
+  x = vset_lane_u8(x8, x, 7);
+  return x;
+}
+
+simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1,  uint16_t x2,  uint16_t x3,  uint16_t x4,
+                                       uint16_t x5,  uint16_t x6,  uint16_t x7,  uint16_t x8) {
+  uint16x8_t x{};
+  x = vsetq_lane_u16(x1, x, 0);
+  x = vsetq_lane_u16(x2, x, 1);
+  x = vsetq_lane_u16(x3, x, 2);
+  x = vsetq_lane_u16(x4, x, 3);
+  x = vsetq_lane_u16(x5, x, 4);
+  x = vsetq_lane_u16(x6, x, 5);
+  x = vsetq_lane_u16(x7, x, 6);
+  x = vsetq_lane_u16(x8, x, 7);;
+  return x;
+}
+
+simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t x3,  int16_t x4,
+                                       int16_t x5,  int16_t x6,  int16_t x7,  int16_t x8) { // x1 goes to lane 0, x8 to lane 7
+  int16x8_t x{}; // signed vector, so it matches vsetq_lane_s16 and the declared return type
+  x = vsetq_lane_s16(x1, x, 0); // lanes are set one at a time, mirroring make_uint16x8_t
+  x = vsetq_lane_s16(x2, x, 1);
+  x = vsetq_lane_s16(x3, x, 2);
+  x = vsetq_lane_s16(x4, x, 3);
+  x = vsetq_lane_s16(x5, x, 4);
+  x = vsetq_lane_s16(x6, x, 5);
+  x = vsetq_lane_s16(x7, x, 6);
+  x = vsetq_lane_s16(x8, x, 7);
+  return x;
 }
 
+
 // End of private section with Visual Studio workaround
 } // namespace
 #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
 
-template<typename T>
-struct simd8;
 
-//
-// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
-//
-template<typename T, typename Mask = simd8<bool>>
-struct base_u8 {
+  template<typename T>
+  struct simd8;
+
+  //
+  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
+  //
+  template<typename T, typename Mask=simd8<bool>>
+  struct base_u8 {
     uint8x16_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
-    simdutf_really_inline base_u8(const uint8x16_t _value)
-        : value(_value)
-    {
-    }
+    simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
     simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
     simdutf_really_inline operator uint8x16_t&() { return this->value; }
-    simdutf_really_inline T first() const { return vgetq_lane_u8(*this, 0); }
-    simdutf_really_inline T last() const { return vgetq_lane_u8(*this, 15); }
+    simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
+    simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
 
     // Bit operations
     simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
@@ -357,74 +352,48 @@ struct base_u8 {
     simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
     simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other)
-    {
-        auto this_cast = static_cast<simd8<T>*>(this);
-        *this_cast = *this_cast | other;
-        return *this_cast;
-    }
-    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other)
-    {
-        auto this_cast = static_cast<simd8<T>*>(this);
-        *this_cast = *this_cast & other;
-        return *this_cast;
-    }
-    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other)
-    {
-        auto this_cast = static_cast<simd8<T>*>(this);
-        *this_cast = *this_cast ^ other;
-        return *this_cast;
-    }
+    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { simd8<T>& self = *static_cast<simd8<T>*>(this); self = self | other; return self; } // in-place OR on the derived simd8<T>
+    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { simd8<T>& self = *static_cast<simd8<T>*>(this); self = self & other; return self; } // in-place AND on the derived simd8<T>
+    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { simd8<T>& self = *static_cast<simd8<T>*>(this); self = self ^ other; return self; } // in-place XOR on the derived simd8<T>
 
     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
 
-    template<int N = 1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
-    {
-        return vextq_u8(prev_chunk, *this, 16 - N);
+    template<int N=1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const { // result: the last N bytes of prev_chunk followed by the first 16-N bytes of *this
+      return vextq_u8(prev_chunk, *this, 16 - N); // extract 16 bytes from prev_chunk:*this, starting at byte 16-N
     }
-};
+  };
 
-// SIMD byte mask type (returned by things like eq and gt)
-template<>
-struct simd8<bool> : base_u8<bool> {
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base_u8<bool> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
     static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8(const uint8x16_t _value)
-        : base_u8<bool>(_value)
-    {
-    }
+    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
     // False constructor
-    simdutf_really_inline simd8()
-        : simd8(vdupq_n_u8(0))
-    {
-    }
+    simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
     // Splat constructor
-    simdutf_really_inline simd8(bool _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
 
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
-    simdutf_really_inline uint32_t to_bitmask() const
-    {
+    simdutf_really_inline uint32_t to_bitmask() const {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+      const uint8x16_t bit_mask =  make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-        const uint8x16_t bit_mask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
+      const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
 #endif
-        auto minput = *this & bit_mask;
-        uint8x16_t tmp = vpaddq_u8(minput, minput);
-        tmp = vpaddq_u8(tmp, tmp);
-        tmp = vpaddq_u8(tmp, tmp);
-        return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+      auto minput = *this & bit_mask; // each lane keeps only the bit assigned to its position within its 8-lane half
+      uint8x16_t tmp = vpaddq_u8(minput, minput); // pairwise add: 16 lanes -> 8 partial masks (bits are distinct, so add acts as OR)
+      tmp = vpaddq_u8(tmp, tmp); // 8 -> 4 partial masks
+      tmp = vpaddq_u8(tmp, tmp); // 4 -> 2: byte 0 covers lanes 0..7, byte 1 covers lanes 8..15
+      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); // the low 16 bits hold the mask, bit i coming from lane i
     }
 
     // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
@@ -432,70 +401,58 @@ struct simd8<bool> : base_u8<bool> {
     // This method is expected to be faster than none() and is equivalent
     // when the vector register is the result of a comparison, with byte
     // values 0xff and 0x00.
-    simdutf_really_inline uint64_t to_bitmask64() const
-    {
-        return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
+    simdutf_really_inline uint64_t to_bitmask64() const { // packs the 16 lanes into 64 bits, 4 bits per lane
+      return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0); // view as 8 x u16, shift right by 4, narrow each to 8 bits, read the 8 result bytes as one u64
     }
 
     simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
     simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
     simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
-};
 
-// Unsigned bytes
-template<>
-struct simd8<uint8_t> : base_u8<uint8_t> {
+
+  };
+
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base_u8<uint8_t> {
     static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
     static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
     static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
-    simdutf_really_inline simd8(const uint8x16_t _value)
-        : base_u8<uint8_t>(_value)
-    {
-    }
+    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
     // Zero constructor
-    simdutf_really_inline simd8()
-        : simd8(zero())
-    {
-    }
+    simdutf_really_inline simd8() : simd8(zero()) {}
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[16])
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-        : simd8(make_uint8x16_t(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15))
-    {
-    }
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
 #else
     simdutf_really_inline simd8(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-        : simd8(uint8x16_t {
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15 })
-    {
-    }
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(uint8x16_t{
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    }) {}
 #endif
 
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-    {
-        return simd8<uint8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
     // Store to array
@@ -508,16 +465,8 @@ struct simd8<uint8_t> : base_u8<uint8_t> {
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
     simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
-    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other)
-    {
-        *this = *this + other;
-        return *this;
-    }
-    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other)
-    {
-        *this = *this - other;
-        return *this;
-    }
+    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { return *this = *this + other; } // lane-wise add, stored back in place
+    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { return *this = *this - other; } // lane-wise subtract, stored back in place
 
     // Order-specific operations
     simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
@@ -546,116 +495,136 @@ struct simd8<uint8_t> : base_u8<uint8_t> {
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
-    {
-        return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+      return lookup_table.apply_lookup_16_to(*this);
     }
 
+
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0, L replace1, L replace2, L replace3,
-        L replace4, L replace5, L replace6, L replace7,
-        L replace8, L replace9, L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const
-    {
-        return lookup_16(simd8<L>::repeat_16(
-            replace0, replace1, replace2, replace3,
-            replace4, replace5, replace6, replace7,
-            replace8, replace9, replace10, replace11,
-            replace12, replace13, replace14, replace15));
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_16(simd8<L>::repeat_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      ));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const
-    {
-        return vqtbl1q_u8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const { // *this is the 16-entry table, `original` supplies the indices
+      return vqtbl1q_u8(*this, simd8<uint8_t>(original)); // result lane i = table[original lane i]
     }
-};
+  };
 
-// Signed bytes
-template<>
-struct simd8<int8_t> {
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> {
     int8x16_t value;
 
     static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
     static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
     static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
-    template<endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const
-    {
-        uint16x8_t first = vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value)));
-        uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
-        if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-            first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
-            second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
-        }
-        vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
-        vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const
-    {
-        vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value))))));
-        vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value)))));
-        vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
-        vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
+
+    // Widens the 16 ASCII bytes to 16 UTF-16 code units using one interleaving store (ST2)
+    // rather than UXTL+UXTL2: UXTL is actually a USHLL #0, and shifting in NEON is quite slow.
+    //
+    // ST2 needs the registers in a specific order, but bigger cores can interleave these
+    // with no overhead, and it still performs decently on little cores.
+    //    movi  v1.3d, #0
+    //      mov   v0.16b, value[0]
+    //    st2   {v0.16b, v1.16b}, [ptr], #32
+    //      mov   v0.16b, value[1]
+    //    st2   {v0.16b, v1.16b}, [ptr], #32
+    //    ...
+    template <endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
+      const int8x16_t zeros = vmovq_n_s8(0);
+      const int8x16x2_t pair = match_system(big_endian) ? int8x16x2_t{{this->value, zeros}}   // ASCII byte first, zero byte second
+                                                        : int8x16x2_t{{zeros, this->value}};  // zero byte first, ASCII byte second
+      vst2q_s8(reinterpret_cast<int8_t *>(p), pair);
+    }
+
+    // currently unused
+    // Widens the 16 ASCII bytes to 16 UTF-32 code units. An ST4 in the style of
+    // store_ascii_as_utf16 would technically work, but it is very much not worth it, as
+    // explicitly mentioned in the ARM Cortex-X1 Core Software Optimization Guide:
+    //   4.18 Complex ASIMD instructions
+    //     The bandwidth of [ST4 with element size less than 64b] is limited by decode
+    //     constraints and it is advisable to avoid them when high performing code is desired.
+    // Instead, ZIP1+ZIP2 widen the bytes to 16 bits and two ST2 stores widen them to 32 bits.
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
+      const int8x16_t zeros = vmovq_n_s8(0);
+      uint16_t * const out = reinterpret_cast<uint16_t *>(p);
+      const uint16x8x2_t lower{{ vreinterpretq_u16_s8(vzip1q_s8(this->value, zeros)), vmovq_n_u16(0) }};
+      const uint16x8x2_t upper{{ vreinterpretq_u16_s8(vzip2q_s8(this->value, zeros)), vmovq_n_u16(0) }};
+      vst2q_u16(out, lower);      // code units 0..7
+      vst2q_u16(out + 16, upper); // code units 8..15 (16 uint16_t == 8 char32_t past p)
+    }
+
+    // Widens the 16 ASCII bytes to 16 UTF-32 code units with 4 table lookups, as there is no direct
+    // zero extension from u8 to u32. Worth it in places where the tables can be reused, which is most uses in simdutf.
+    simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t * p) const {
+      const simd8<uint8_t> tb1{  0,255,255,255,  1,255,255,255,  2,255,255,255,  3,255,255,255 }; // picks input bytes 0..3; index 255 is outside the 16-byte table, which TBL turns into a 0 byte
+      const simd8<uint8_t> tb2{  4,255,255,255,  5,255,255,255,  6,255,255,255,  7,255,255,255 }; // input bytes 4..7
+      const simd8<uint8_t> tb3{  8,255,255,255,  9,255,255,255, 10,255,255,255, 11,255,255,255 }; // input bytes 8..11
+      const simd8<uint8_t> tb4{ 12,255,255,255, 13,255,255,255, 14,255,255,255, 15,255,255,255 }; // input bytes 12..15
+
+      // encourage store pairing and interleaving: two lookups, then their two stores
+      const auto shuf1 = this->apply_lookup_16_to(tb1);
+      const auto shuf2 = this->apply_lookup_16_to(tb2);
+      shuf1.store(reinterpret_cast<int8_t *>(p)); // code units 0..3
+      shuf2.store(reinterpret_cast<int8_t *>(p + 4)); // code units 4..7 (p is char32_t*, so +4 is 16 bytes)
+
+      const auto shuf3 = this->apply_lookup_16_to(tb3);
+      const auto shuf4 = this->apply_lookup_16_to(tb4);
+      shuf3.store(reinterpret_cast<int8_t *>(p + 8)); // code units 8..11
+      shuf4.store(reinterpret_cast<int8_t *>(p + 12)); // code units 12..15
     }
     // Conversion from/to SIMD register
-    simdutf_really_inline simd8(const int8x16_t _value)
-        : value { _value }
-    {
-    }
+    simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
     simdutf_really_inline operator const int8x16_t&() const { return this->value; }
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
+#endif
     simdutf_really_inline operator int8x16_t&() { return this->value; }
 
     // Zero constructor
-    simdutf_really_inline simd8()
-        : simd8(zero())
-    {
-    }
+    simdutf_really_inline simd8() : simd8(zero()) {}
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values)
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-        : simd8(make_int8x16_t(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15))
-    {
-    }
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
 #else
     simdutf_really_inline simd8(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-        : simd8(int8x16_t {
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15 })
-    {
-    }
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(int8x16_t{
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    }) {}
 #endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-    {
-        return simd8<int8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
     // Store to array
@@ -666,15 +635,9 @@ struct simd8<int8_t> {
     // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
     // and relatively ugly and hard to read.
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-    simdutf_really_inline explicit simd8(const uint8x16_t other)
-        : simd8(vreinterpretq_s8_u8(other))
-    {
-    }
+    simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
 #endif
-    simdutf_really_inline operator simd8<uint8_t>() const
-    {
-        return vreinterpretq_u8_s8(this->value);
-    }
+    simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
 
     simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
@@ -684,16 +647,8 @@ struct simd8<int8_t> {
     // Math
     simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
-    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other)
-    {
-        *this = *this + other;
-        return *this;
-    }
-    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other)
-    {
-        *this = *this - other;
-        return *this;
-    }
+    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { return *this = *this + other; } // lane-wise add, stored back in place
+    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { return *this = *this - other; } // lane-wise subtract, stored back in place
 
     simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
     simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
@@ -706,41 +661,38 @@ struct simd8<int8_t> {
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
     simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
 
-    template<int N = 1>
-    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const
-    {
-        return vextq_s8(prev_chunk, *this, 16 - N);
+    template<int N=1>
+    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const { // result: the last N bytes of prev_chunk followed by the first 16-N bytes of *this
+      return vextq_s8(prev_chunk, *this, 16 - N); // extract 16 bytes from prev_chunk:*this, starting at byte 16-N
     }
 
     // Perform a lookup assuming no value is larger than 16
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
-    {
-        return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+      return lookup_table.apply_lookup_16_to(*this);
     }
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0, L replace1, L replace2, L replace3,
-        L replace4, L replace5, L replace6, L replace7,
-        L replace8, L replace9, L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const
-    {
-        return lookup_16(simd8<L>::repeat_16(
-            replace0, replace1, replace2, replace3,
-            replace4, replace5, replace6, replace7,
-            replace8, replace9, replace10, replace11,
-            replace12, replace13, replace14, replace15));
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_16(simd8<L>::repeat_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      ));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original)
-    {
-        return vqtbl1q_s8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) const { // *this is the 16-entry table, `original` supplies the indices; const so it can be called from const members
+      return vqtbl1q_s8(*this, simd8<uint8_t>(original)); // result lane i = table[original lane i]
     }
-};
+  };
 
-template<typename T>
-struct simd8x64 {
+  template<typename T>
+  struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -749,181 +701,159 @@ struct simd8x64 {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
-        : chunks { chunk0, chunk1, chunk2, chunk3 }
-    {
-    }
-    simdutf_really_inline simd8x64(const T* ptr)
-        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
-    {
-    }
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
-        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
-        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    simdutf_really_inline void store(T* ptr) const {
+      // chunk i is written sizeof(simd8<T>)*i bytes past ptr, expressed in elements of T
+      for (int i = 0; i < NUM_CHUNKS; i++) {
+        this->chunks[i].store(ptr + sizeof(simd8<T>) * i / sizeof(T));
+      }
     }
 
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
-    {
-        this->chunks[0] |= other.chunks[0];
-        this->chunks[1] |= other.chunks[1];
-        this->chunks[2] |= other.chunks[2];
-        this->chunks[3] |= other.chunks[3];
-        return *this;
+
+    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
+      // OR every chunk of `other` into the matching chunk of this block, in place
+      for (int i = 0; i < NUM_CHUNKS; i++) {
+        this->chunks[i] |= other.chunks[i];
+      }
+      return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const
-    {
-        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd8<T> reduce_or() const { // ORs the four chunks together into one simd8<T>
+      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); // two independent ORs first, then one combining OR
     }
 
-    simdutf_really_inline bool is_ascii() const
-    {
-        return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const { // applies the per-register is_ascii() check to the OR of all four chunks
+      return reduce_or().is_ascii(); // NOTE(review): simd8<T>::is_ascii is defined outside this view
     }
 
-    template<endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
-        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
-        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
+    template <endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      // the output pointer advances by sizeof(simd8<T>) UTF-16 code units per chunk
+      for (int i = 0; i < NUM_CHUNKS; i++) {
+        this->chunks[i].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * i);
+      }
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
-        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
-        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
+      // uses the table-lookup variant; the output advances by sizeof(simd8<T>) UTF-32 code units per chunk
+      for (int i = 0; i < NUM_CHUNKS; i++) {
+        this->chunks[i].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * i);
+      }
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
+    simdutf_really_inline uint64_t to_bitmask() const {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t bit_mask = make_uint8x16_t(
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
 #else
-        const uint8x16_t bit_mask = {
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-        };
+      const uint8x16_t bit_mask = {
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      };
 #endif
-        // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-        uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
-        uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
-        sum0 = vpaddq_u8(sum0, sum1);
-        sum0 = vpaddq_u8(sum0, sum0);
-        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask,
-            this->chunks[2] == mask,
-            this->chunks[3] == mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask,
-            this->chunks[2] <= mask,
-            this->chunks[3] <= mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-
-        return simd8x64<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-        return simd8x64<bool>(
-            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask,
-            this->chunks[2] < mask,
-            this->chunks[3] < mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] > mask,
-            this->chunks[1] > mask,
-            this->chunks[2] > mask,
-            this->chunks[3] > mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] >= mask,
-            this->chunks[1] >= mask,
-            this->chunks[2] >= mask,
-            this->chunks[3] >= mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
-    {
-        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-        return simd8x64<bool>(
-            simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
-            simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
-            simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
-            simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask)
-            .to_bitmask();
-    }
-}; // struct simd8x64<T>
+      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+      uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
+      uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
+      sum0 = vpaddq_u8(sum0, sum1);
+      sum0 = vpaddq_u8(sum0, sum0);
+      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return  simd8x64<bool>(
+      this->chunks[0] == mask,
+      this->chunks[1] == mask,
+      this->chunks[2] == mask,
+      this->chunks[3] == mask
+    ).to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return  simd8x64<bool>(
+      this->chunks[0] <= mask,
+      this->chunks[1] <= mask,
+      this->chunks[2] <= mask,
+      this->chunks[3] <= mask
+    ).to_bitmask();
+  }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+
+      return  simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+      return  simd8x64<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask,
+        this->chunks[2] < mask,
+        this->chunks[3] < mask
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t gt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask,
+        this->chunks[2] > mask,
+        this->chunks[3] > mask
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] >= mask,
+        this->chunks[1] >= mask,
+        this->chunks[2] >= mask,
+        this->chunks[3] >= mask
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+      return  simd8x64<bool>(
+        simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
+        simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
+        simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
+        simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
+      ).to_bitmask();
+    }
+  }; // struct simd8x64<T>
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/simd16-inl.h
 /* begin file src/simdutf/arm64/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask = simd16<bool>>
-struct base_u16 {
+  template<typename T, typename Mask=simd16<bool>>
+  struct base_u16 {
     uint16x8_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
     simdutf_really_inline base_u16() = default;
-    simdutf_really_inline base_u16(const uint16x8_t _value)
-        : value(_value)
-    {
-    }
+    simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
     simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
     simdutf_really_inline operator uint16x8_t&() { return this->value; }
     // Bit operations
@@ -932,244 +862,165 @@ struct base_u16 {
     simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
     simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
     simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other)
-    {
-        auto this_cast = static_cast<simd16<T>*>(this);
-        *this_cast = *this_cast | other;
-        return *this_cast;
-    }
-    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other)
-    {
-        auto this_cast = static_cast<simd16<T>*>(this);
-        *this_cast = *this_cast & other;
-        return *this_cast;
-    }
-    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other)
-    {
-        auto this_cast = static_cast<simd16<T>*>(this);
-        *this_cast = *this_cast ^ other;
-        return *this_cast;
-    }
+    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
+    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
+    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
 
     friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
 
-    template<int N = 1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
-    {
-        return vextq_u18(prev_chunk, *this, 8 - N);
+    template<int N=1> // N = number of 16-bit lanes to carry over from prev_chunk
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+      return vextq_u16(prev_chunk, *this, 8 - N); // last N lanes of prev_chunk, then the first 8-N lanes of *this
     }
-};
+  };
 
-template<typename T, typename Mask = simd16<bool>>
-struct base16 : base_u16<T> {
-    typedef uint16_t bitmask_t;
-    typedef uint32_t bitmask2_t;
+template<typename T, typename Mask=simd16<bool>>
+struct base16: base_u16<T> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline base16()
-        : base_u16<T>()
-    {
-    }
-    simdutf_really_inline base16(const uint16x8_t _value)
-        : base_u16<T>(_value)
-    {
-    }
-    template<typename Pointer>
-    simdutf_really_inline base16(const Pointer* ptr)
-        : base16(vld1q_u16(ptr))
-    {
-    }
+  simdutf_really_inline base16() : base_u16<T>() {}
+  simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
 
-    static const int SIZE = sizeof(base_u16<T>::value);
+  static const int SIZE = sizeof(base_u16<T>::value);
 
-    template<int N = 1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
-    {
-        return vextq_u18(prev_chunk, *this, 8 - N);
-    }
+  template<int N=1> // N = number of 16-bit lanes to carry over from prev_chunk
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return vextq_u16(prev_chunk, *this, 8 - N); // last N lanes of prev_chunk, then the first 8-N lanes of *this
+  }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool> : base16<bool> {
-    static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
+struct simd16<bool>: base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
+
+  simdutf_really_inline simd16<bool>() : base16() {}
+  simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
 
-    simdutf_really_inline simd16<bool>()
-        : base16()
-    {
-    }
-    simdutf_really_inline simd16<bool>(const uint16x8_t _value)
-        : base16<bool>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd16<bool>(bool _value)
-        : base16<bool>(splat(_value))
-    {
-    }
 };
 
 template<typename T>
-struct base16_numeric : base16<T> {
-    static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
-    static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
-    static simdutf_really_inline simd16<T> load(const T values[8])
-    {
-        return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
-    }
-
-    simdutf_really_inline base16_numeric()
-        : base16<T>()
-    {
-    }
-    simdutf_really_inline base16_numeric(const uint16x8_t _value)
-        : base16<T>(_value)
-    {
-    }
-
-    // Store to array
-    simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
-
-    // Override to distinguish from bool version
-    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-    // Addition/subtraction are the same for signed and unsigned
-    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
-    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
-    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd16<T>*>(this);
-    }
-    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd16<T>*>(this);
-    }
+struct base16_numeric: base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
+  static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
+  static simdutf_really_inline simd16<T> load(const T values[8]) {
+    return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
+  }
+
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
+
+  // Override to distinguish from bool version
+  simdutf_really_inline simd16<T> operator~() const { return vmvnq_u16(*this); } // bitwise NOT of all 16 bits in every lane
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u16(*this, other); } // 16-bit lane add: carries must cross the byte boundary
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u16(*this, other); } // 16-bit lane subtract: borrows must cross the byte boundary
+  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
+  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<int16_t>()
-    {
-    }
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-    simdutf_really_inline simd16(const uint16x8_t _value)
-        : base16_numeric<int16_t>(_value)
-    {
-    }
+  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
 #endif
-    simdutf_really_inline simd16(const int16x8_t _value)
-        : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value))
-    {
-    }
-
-    // Splat constructor
-    simdutf_really_inline simd16(int16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const int16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const int16_t*>(values)))
-    {
-    }
-    simdutf_really_inline operator simd16<uint16_t>() const;
-    simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
-    simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
-
-    simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
-    simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
-    // Order-sensitive comparisons
-    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+  simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
+  simdutf_really_inline operator simd16<uint16_t>() const;
+  simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
+  simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
+
+  simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
+  simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vminq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } // lane-wise signed minimum
+  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
 };
 
-// Unsigned words
-template<>
-struct simd16<uint16_t> : base16_numeric<uint16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<uint16_t>()
-    {
-    }
-    simdutf_really_inline simd16(const uint16x8_t _value)
-        : base16_numeric<uint16_t>(_value)
-    {
-    }
 
-    // Splat constructor
-    simdutf_really_inline simd16(uint16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const uint16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
-    {
-    }
-
-    simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
-    simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
-    // Saturated math
-    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
-
-    // Order-specific operations
-    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
-    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
-    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
-    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return vcgtq_u16(*this, other); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
 
-    // Bit-specific operations
-    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
-
-    // logical operations
-    simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
-
-    // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
-    {
-        return vqmovn_high_u16(vqmovn_u16(v0), v1);
-    }
 
-    // Change the endianness
-    simdutf_really_inline simd16<uint16_t> swap_bytes() const
-    {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-        const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-        return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
-    }
+// Unsigned words
+template<>
+struct simd16<uint16_t>: base16_numeric<uint16_t>  {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
+
+
+  simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
+  simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
+
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return  vcgtq_u16(*this, other); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
+
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
+
+  // logical operations
+  simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
+
+  // Pack two vectors of uint16_t words into a single uint8_t vector, with unsigned saturation
+  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
+    return vqmovn_high_u16(vqmovn_u16(v0), v1);
+  }
+
+  // Change the endianness
+  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+    #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    #else
+    const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    #endif
+    return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
+  }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
-template<typename T>
-struct simd16x32 {
+
+  template<typename T>
+  struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -1178,138 +1029,122 @@ struct simd16x32 {
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
-        : chunks { chunk0, chunk1, chunk2, chunk3 }
-    {
-    }
-    simdutf_really_inline simd16x32(const T* ptr)
-        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
-    {
-    }
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
-        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
-        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+    simdutf_really_inline void store(T* ptr) const {
+      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
+      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
+      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
+      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const
-    {
-        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd16<T> reduce_or() const {
+      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-    simdutf_really_inline bool is_ascii() const
-    {
-        return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const {
+      return reduce_or().is_ascii();
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
-        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
-        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
-        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
+      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
+      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
+      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
+    simdutf_really_inline uint64_t to_bitmask() const {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t bit_mask = make_uint8x16_t(
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
 #else
-        const uint8x16_t bit_mask = {
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-        };
+      const uint8x16_t bit_mask = {
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      };
 #endif
-        // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-        uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
-        uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
-        sum0 = vpaddq_u8(sum0, sum1);
-        sum0 = vpaddq_u8(sum0, sum0);
-        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline void swap_bytes()
-    {
-        this->chunks[0] = this->chunks[0].swap_bytes();
-        this->chunks[1] = this->chunks[1].swap_bytes();
-        this->chunks[2] = this->chunks[2].swap_bytes();
-        this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask,
-            this->chunks[2] == mask,
-            this->chunks[3] == mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask,
-            this->chunks[2] <= mask,
-            this->chunks[3] <= mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(low);
-        const simd16<T> mask_high = simd16<T>::splat(high);
-
-        return simd16x32<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(low);
-        const simd16<T> mask_high = simd16<T>::splat(high);
-        return simd16x32<bool>(
-            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask,
-            this->chunks[2] < mask,
-            this->chunks[3] < mask)
-            .to_bitmask();
-    }
-
-}; // struct simd16x32<T>
-template<>
-simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const
-{
-    const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
-    const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
-    simd16x32<uint16_t> x(
+      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+      uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
+      uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
+      sum0 = vpaddq_u8(sum0, sum1);
+      sum0 = vpaddq_u8(sum0, sum0);
+      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline void swap_bytes() {
+      this->chunks[0] = this->chunks[0].swap_bytes();
+      this->chunks[1] = this->chunks[1].swap_bytes();
+      this->chunks[2] = this->chunks[2].swap_bytes();
+      this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m);
+    return  simd16x32<bool>(
+      this->chunks[0] == mask,
+      this->chunks[1] == mask,
+      this->chunks[2] == mask,
+      this->chunks[3] == mask
+    ).to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m);
+    return  simd16x32<bool>(
+      this->chunks[0] <= mask,
+      this->chunks[1] <= mask,
+      this->chunks[2] <= mask,
+      this->chunks[3] <= mask
+    ).to_bitmask();
+  }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd16<T> mask_low = simd16<T>::splat(low);
+      const simd16<T> mask_high = simd16<T>::splat(high);
+
+      return  simd16x32<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd16<T> mask_low = simd16<T>::splat(low);
+      const simd16<T> mask_high = simd16<T>::splat(high);
+      return  simd16x32<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return  simd16x32<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask,
+        this->chunks[2] < mask,
+        this->chunks[3] < mask
+      ).to_bitmask();
+    }
+
+  }; // struct simd16x32<T>
+  template<>
+  simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
+      const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
+      const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
+      simd16x32<uint16_t> x(
         simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
         simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
         simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
-        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)));
-    return x.to_bitmask();
-}
+        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+      );
+      return  x.to_bitmask();
+    }
 /* end file src/simdutf/arm64/simd16-inl.h */
 } // namespace simd
 } // unnamed namespace
@@ -1332,6 +1167,8 @@ simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t
 #ifndef SIMDUTF_ICELAKE_H
 #define SIMDUTF_ICELAKE_H
 
+
+
 #ifdef __has_include
 // How do we detect that a compiler supports vbmi2?
 // For sure if the following header is found, we are ok?
@@ -1355,13 +1192,16 @@ simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t
 
 // To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
 // https://github.com/simdutf/simdutf/issues/1247
-#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL && SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
+#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \
+                                         SIMDUTF_HAS_AVX512DQ && \
+                                         SIMDUTF_HAS_AVX512VL && \
+                                           SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
 
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 #define SIMDUTF_TARGET_ICELAKE
 #else
-#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
+#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq")
 #endif
 
 namespace simdutf {
@@ -1369,6 +1209,8 @@ namespace icelake {
 } // namespace icelake
 } // namespace simdutf
 
+
+
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
@@ -1377,9 +1219,10 @@ namespace icelake {
 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
 #define SIMDUTF_ICELAKE_INTRINSICS_H
 
+
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h> // visual studio or clang
+#include <intrin.h>  // visual studio or clang
 #include <immintrin.h>
 #else
 
@@ -1393,6 +1236,7 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
+
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1422,10 +1266,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h> // for _blsr_u64
-#include <bmi2intrin.h> // for _pext_u64, _pdep_u64
+#include <bmiintrin.h>   // for _blsr_u64
+#include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
@@ -1439,6 +1283,8 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #include <avx512vlbwintrin.h>
 #include <avx512vbmiintrin.h>
 #include <avx512vbmi2intrin.h>
+#include <avx512vpopcntdqintrin.h>
+#include <avx512vpopcntdqvlintrin.h>
 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
 // has it as a macro.
 #ifndef _blsr_u64
@@ -1447,6 +1293,8 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif //  _blsr_u64
 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
 
+
+
 #if defined(__GNUC__) && !defined(__clang__)
 
 #if __GNUC__ == 8
@@ -1463,16 +1311,15 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /**
  * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
  */
-inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63)
-{
-    return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
-        uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
-        uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
-        uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
-        uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
-        uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
-        uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
-        uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
+inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
+  return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
+                          uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
+                          uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
+                          uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
+                          uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
+                          uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
+                          uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
+                          uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
 }
 #pragma GCC pop_options
 #endif // SIMDUTF_GCC8
@@ -1484,6 +1331,7 @@ inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, u
 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
 
+
 namespace simdutf {
 namespace icelake {
 
@@ -1493,88 +1341,85 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation(
-            "icelake",
-            "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
-            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
+  simdutf_really_inline implementation() : simdutf::implementation(
+      "icelake",
+      "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
+      internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 | internal::instruction_set::AVX512VPOPCNTDQ ) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32( size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
 };
 
 } // namespace icelake
@@ -1598,7 +1443,7 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe - uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 // Declarations
@@ -1612,15 +1457,13 @@ namespace icelake {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
-{
-    // note: we do not support legacy 32-bit Windows
-    return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt64(input_num);// Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num)
-{
-    return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+  return _popcnt64(input_num);
 }
 #endif
 
@@ -1638,11 +1481,14 @@ simdutf_really_inline long long int count_ones(uint64_t input_num)
 SIMDUTF_UNTARGET_REGION
 #endif
 
+
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
 /* end file src/simdutf/icelake/end.h */
 
+
+
 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
 #endif // SIMDUTF_ICELAKE_H
 /* end file src/simdutf/icelake.h */
@@ -1658,6 +1504,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "haswell.h must be included before fallback.h"
 #endif
 
+
 // Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
 // at runtime.
 #ifndef SIMDUTF_IMPLEMENTATION_HASWELL
@@ -1696,6 +1543,7 @@ namespace haswell {
 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
 
+
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace haswell {
@@ -1704,88 +1552,86 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation(
-            "haswell",
-            "Intel/AMD AVX2",
-            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
+  simdutf_really_inline implementation() : simdutf::implementation(
+      "haswell",
+      "Intel/AMD AVX2",
+      internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
+  ) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32( size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
 };
 
 } // namespace haswell
@@ -1798,9 +1644,10 @@ public:
 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
 #define SIMDUTF_HASWELL_INTRINSICS_H
 
+
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h> // visual studio or clang
+#include <intrin.h>  // visual studio or clang
 #else
 
 #if SIMDUTF_GCC11ORMORE
@@ -1813,6 +1660,7 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
+
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1839,9 +1687,9 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h> // for _blsr_u64
+#include <bmiintrin.h>   // for _blsr_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
@@ -1872,7 +1720,7 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe - uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 // Declarations
@@ -1886,15 +1734,13 @@ namespace haswell {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
-{
-    // note: we do not support legacy 32-bit Windows
-    return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt64(input_num);// Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num)
-{
-    return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+  return _popcnt64(input_num);
 }
 #endif
 
@@ -1909,249 +1755,185 @@ simdutf_really_inline long long int count_ones(uint64_t input_num)
 #ifndef SIMDUTF_HASWELL_SIMD_H
 #define SIMDUTF_HASWELL_SIMD_H
 
+
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace simd {
 
-// Forward-declared so they can be used by splat and friends.
-template<typename Child>
-struct base {
+  // Forward-declared so they can be used by splat and friends.
+  template<typename Child>
+  struct base {
     __m256i value;
 
     // Zero constructor
-    simdutf_really_inline base()
-        : value { __m256i() }
-    {
-    }
+    simdutf_really_inline base() : value{__m256i()} {}
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m256i _value)
-        : value(_value)
-    {
-    }
+    simdutf_really_inline base(const __m256i _value) : value(_value) {}
     // Conversion to SIMD register
     simdutf_really_inline operator const __m256i&() const { return this->value; }
     simdutf_really_inline operator __m256i&() { return this->value; }
-    template<endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
-        __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
-        if (big_endian) {
-            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            first = _mm256_shuffle_epi8(first, swap);
-            second = _mm256_shuffle_epi8(second, swap);
-        }
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), first);
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
-    {
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this, 8))));
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this, 1), 8)));
+    template <endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
+      __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
+      if (big_endian) {
+        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+        first = _mm256_shuffle_epi8(first, swap);
+        second = _mm256_shuffle_epi8(second, swap);
+      }
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast | other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator&=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast & other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator^=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast ^ other;
-        return *this_cast;
-    }
-};
+    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
+    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
+    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+  };
 
-// Forward-declared so they can be used by splat and friends.
-template<typename T>
-struct simd8;
+  // Forward-declared so they can be used by splat and friends.
+  template<typename T>
+  struct simd8;
 
-template<typename T, typename Mask = simd8<bool>>
-struct base8 : base<simd8<T>> {
+  template<typename T, typename Mask=simd8<bool>>
+  struct base8: base<simd8<T>> {
     typedef uint32_t bitmask_t;
     typedef uint64_t bitmask2_t;
 
-    simdutf_really_inline base8()
-        : base<simd8<T>>()
-    {
-    }
-    simdutf_really_inline base8(const __m256i _value)
-        : base<simd8<T>>(_value)
-    {
-    }
-    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this, 0); }
-    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this, 31); }
+    simdutf_really_inline base8() : base<simd8<T>>() {}
+    simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
+    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
+    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<T>::value);
 
-    template<int N = 1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
-    {
-        return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
+    template<int N=1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+      return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
     }
-};
+  };
 
-// SIMD byte mask type (returned by things like eq and gt)
-template<>
-struct simd8<bool> : base8<bool> {
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>()
-        : base8()
-    {
-    }
-    simdutf_really_inline simd8<bool>(const __m256i _value)
-        : base8<bool>(_value)
-    {
-    }
+    simdutf_really_inline simd8<bool>() : base8() {}
+    simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value)
-        : base8<bool>(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
 
     simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
     simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-};
+  };
 
-template<typename T>
-struct base8_numeric : base8<T> {
+  template<typename T>
+  struct base8_numeric: base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
-    static simdutf_really_inline simd8<T> load(const T values[32])
-    {
-        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
+    static simdutf_really_inline simd8<T> load(const T values[32]) {
+      return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
-        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
-    {
-        return simd8<T>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15,
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
+      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
+    ) {
+      return simd8<T>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
-    simdutf_really_inline base8_numeric()
-        : base8<T>()
-    {
-    }
-    simdutf_really_inline base8_numeric(const __m256i _value)
-        : base8<T>(_value)
-    {
-    }
+    simdutf_really_inline base8_numeric() : base8<T>() {}
+    simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
 
     // Store to array
-    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
+    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
 
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd8<T>*>(this);
-    }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd8<T>*>(this);
-    }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
-    {
-        return _mm256_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+      return _mm256_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0, L replace1, L replace2, L replace3,
-        L replace4, L replace5, L replace6, L replace7,
-        L replace8, L replace9, L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const
-    {
-        return lookup_16(simd8<L>::repeat_16(
-            replace0, replace1, replace2, replace3,
-            replace4, replace5, replace6, replace7,
-            replace8, replace9, replace10, replace11,
-            replace12, replace13, replace14, replace15));
-    }
-};
-
-// Signed bytes
-template<>
-struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<int8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m256i _value)
-        : base8_numeric<int8_t>(_value)
-    {
-    }
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_16(simd8<L>::repeat_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      ));
+    }
+  };
+
+
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
 
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const int8_t values[32])
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
     simdutf_really_inline operator simd8<uint8_t>() const;
     // Member-by-member initialization
     simdutf_really_inline simd8(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
-        int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
-        int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31)
-        : simd8(_mm256_setr_epi8(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15,
-            v16, v17, v18, v19, v20, v21, v22, v23,
-            v24, v25, v26, v27, v28, v29, v30, v31))
-    {
-    }
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+      int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+      int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
+    ) : simd8(_mm256_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15,
+      v16,v17,v18,v19,v20,v21,v22,v23,
+      v24,v25,v26,v27,v28,v29,v30,v31
+    )) {}
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-    {
-        return simd8<int8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15,
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
     // Order-sensitive comparisons
@@ -2159,54 +1941,43 @@ struct simd8<int8_t> : base8_numeric<int8_t> {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
-};
+  };
 
-// Unsigned bytes
-template<>
-struct simd8<uint8_t> : base8_numeric<uint8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<uint8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m256i _value)
-        : base8_numeric<uint8_t>(_value)
-    {
-    }
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base8_numeric<uint8_t> {
+    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[32])
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
     // Member-by-member initialization
     simdutf_really_inline simd8(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
-        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
-        uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31)
-        : simd8(_mm256_setr_epi8(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15,
-            v16, v17, v18, v19, v20, v21, v22, v23,
-            v24, v25, v26, v27, v28, v29, v30, v31))
-    {
-    }
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+      uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
+      uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
+    ) : simd8(_mm256_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15,
+      v16,v17,v18,v19,v20,v21,v22,v23,
+      v24,v25,v26,v27,v28,v29,v30,v31
+    )) {}
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-    {
-        return simd8<uint8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15,
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15,
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
+
     // Saturated math
     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
@@ -2240,12 +2011,13 @@ struct simd8<uint8_t> : base8_numeric<uint8_t> {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7 - N)); }
-};
-simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
+  };
+  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-template<typename T>
-struct simd8x64 {
+
+  template<typename T>
+  struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -2254,383 +2026,296 @@ struct simd8x64 {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
-        : chunks { chunk0, chunk1 }
-    {
-    }
-    simdutf_really_inline simd8x64(const T* ptr)
-        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)) }
-    {
-    }
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
+    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+    simdutf_really_inline void store(T* ptr) const {
+      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
+      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
-        uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-        uint64_t r_hi = this->chunks[1].to_bitmask();
-        return r_lo | (r_hi << 32);
+    simdutf_really_inline uint64_t to_bitmask() const {
+      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+      uint64_t r_hi =                       this->chunks[1].to_bitmask();
+      return r_lo | (r_hi << 32);
     }
 
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
-    {
-        this->chunks[0] |= other.chunks[0];
-        this->chunks[1] |= other.chunks[1];
-        return *this;
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
+      this->chunks[0] |= other.chunks[0];
+      this->chunks[1] |= other.chunks[1];
+      return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const
-    {
-        return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd8<T> reduce_or() const {
+      return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const
-    {
-        return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const {
+      return this->reduce_or().is_ascii();
     }
 
-    template<endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
+    template <endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
+      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
+      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
+      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
     }
 
-    simdutf_really_inline simd8x64<T> bit_or(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<T>(
-            this->chunks[0] | mask,
-            this->chunks[1] | mask);
+    simdutf_really_inline simd8x64<T> bit_or(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<T>(
+        this->chunks[0] | mask,
+        this->chunks[1] | mask
+      );
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] == mask,
+        this->chunks[1] == mask
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
-    {
-        return simd8x64<bool>(
-            this->chunks[0] == other.chunks[0],
-            this->chunks[1] == other.chunks[1])
-            .to_bitmask();
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+      return  simd8x64<bool>(
+        this->chunks[0] == other.chunks[0],
+        this->chunks[1] == other.chunks[1]
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
+    // Bitmask (one bit per byte of the 64-byte block) of the bytes b with low <= b <= high.
+    // This kernel asserts NUM_CHUNKS == 2, so only chunks[0] and chunks[1] may be read.
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
 
-        return simd8x64<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
+      return  simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
+      ).to_bitmask();
     }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-        return simd8x64<bool>(
-            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+      return  simd8x64<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t gt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] > mask,
-            this->chunks[1] > mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t gt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask
+      ).to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] >= mask,
-            this->chunks[1] >= mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t gteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] >= mask,
+        this->chunks[1] >= mask
+      ).to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
-    {
-        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-        return simd8x64<bool>(
-            (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
-            (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
-            .to_bitmask();
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+      return  simd8x64<bool>(
+        (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
+        (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
+      ).to_bitmask();
     }
-}; // struct simd8x64<T>
+  }; // struct simd8x64<T>
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/simd16-inl.h
 /* begin file src/simdutf/haswell/simd16-inl.h */
 #ifdef __GNUC__
 #if __GNUC__ < 8
 #define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
-#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
+#define _mm256_setr_m128i(xmm2, xmm1)  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
 #endif
 #endif
 
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask = simd16<bool>>
-struct base16 : base<simd16<T>> {
-    using bitmask_type = uint32_t;
+template<typename T, typename Mask=simd16<bool>>
+struct base16: base<simd16<T>> {
+  using bitmask_type = uint32_t;
 
-    simdutf_really_inline base16()
-        : base<simd16<T>>()
-    {
-    }
-    simdutf_really_inline base16(const __m256i _value)
-        : base<simd16<T>>(_value)
-    {
-    }
-    template<typename Pointer>
-    simdutf_really_inline base16(const Pointer* ptr)
-        : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)))
-    {
-    }
-    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
+  simdutf_really_inline base16() : base<simd16<T>>() {}
+  simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
+  friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
 
-    /// the size of vector in bytes
-    static const int SIZE = sizeof(base<simd16<T>>::value);
+  /// the size of vector in bytes
+  static const int SIZE = sizeof(base<simd16<T>>::value);
 
-    /// the number of elements of type T a vector can hold
-    static const int ELEMENTS = SIZE / sizeof(T);
+  /// the number of elements of type T a vector can hold
+  static const int ELEMENTS = SIZE / sizeof(T);
 
-    template<int N = 1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
-    {
-        return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
-    }
+  template<int N=1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
+  }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool> : base16<bool> {
-    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool>: base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
 
-    simdutf_really_inline simd16<bool>()
-        : base16()
-    {
-    }
-    simdutf_really_inline simd16<bool>(const __m256i _value)
-        : base16<bool>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd16<bool>(bool _value)
-        : base16<bool>(splat(_value))
-    {
-    }
+  simdutf_really_inline simd16<bool>() : base16() {}
+  simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
 
-    simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
-    simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
-    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+  simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
+  simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
+  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
 };
 
 template<typename T>
-struct base16_numeric : base16<T> {
-    static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
-    static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
-    static simdutf_really_inline simd16<T> load(const T values[8])
-    {
-        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
-    }
-
-    simdutf_really_inline base16_numeric()
-        : base16<T>()
-    {
-    }
-    simdutf_really_inline base16_numeric(const __m256i _value)
-        : base16<T>(_value)
-    {
-    }
-
-    // Store to array
-    simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
-
-    // Override to distinguish from bool version
-    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
-
-    // Addition/subtraction are the same for signed and unsigned
-    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
-    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
-    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd16<T>*>(this);
-    }
-    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd16<T>*>(this);
-    }
+struct base16_numeric: base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
+  static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
+  static simdutf_really_inline simd16<T> load(const T values[16]) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+  }
+
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
+
+  // Store the whole 256-bit vector (sixteen 16-bit lanes) to dst
+  simdutf_really_inline void store(T dst[16]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
+
+  // Override to distinguish from bool version
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
+  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
+  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<int16_t>()
-    {
-    }
-    simdutf_really_inline simd16(const __m256i _value)
-        : base16_numeric<int16_t>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd16(int16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const int16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const int16_t*>(values)))
-    {
-    }
-    // Order-sensitive comparisons
-    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
-    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
-    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
+  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t> : base16_numeric<uint16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<uint16_t>()
-    {
-    }
-    simdutf_really_inline simd16(const __m256i _value)
-        : base16_numeric<uint16_t>(_value)
-    {
-    }
-
-    // Splat constructor
-    simdutf_really_inline simd16(uint16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const uint16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
-    {
-    }
-
-    // Saturated math
-    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
-
-    // Order-specific operations
-    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
-    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-    // Bit-specific operations
-    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
-    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
-    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
-    // Get one of the bits and make a bitmask out of it.
-    // e.g. value.get_bit<7>() gets the high bit
-    template<int N>
-    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15 - N)); }
-
-    // Change the endianness
-    simdutf_really_inline simd16<uint16_t> swap_bytes() const
-    {
-        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-            17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-        return _mm256_shuffle_epi8(*this, swap);
-    }
-
-    // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
-    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
-    {
-        // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
-        //       we have to shuffle lanes in order to produce bytes in the
-        //       correct order.
-
-        // get the 0th lanes
-        const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
-        const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
-
-        // get the 1st lanes
-        const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
-        const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
-
-        // build new vectors (shuffle lanes)
-        const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
-        const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
-
-        // pack words in linear order from v0 and v1
-        return _mm256_packus_epi16(t0, t1);
-    }
+struct simd16<uint16_t>: base16_numeric<uint16_t>  {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
+
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
+
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // lt_bits is nonzero exactly where *this < other
+
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
+  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
+  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
+  // Get one of the bits and make a bitmask out of it.
+  // e.g. value.get_bit<15>() gets the high bit of each 16-bit word
+  template<int N>
+  simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
+
+  // Change the endianness
+  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+    const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+    return _mm256_shuffle_epi8(*this, swap);
+  }
+
+  // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
+  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
+    // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
+    //       we have to shuffle lanes in order to produce bytes in the
+    //       correct order.
+
+    // get the 0th lanes
+    const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
+    const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
+
+    // get the 1st lanes
+    const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
+    const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
+
+    // build new vectors (shuffle lanes)
+    const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
+    const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
+
+    // pack words in linear order from v0 and v1
+    return _mm256_packus_epi16(t0, t1);
+  }
 };
 
-template<typename T>
-struct simd16x32 {
+
+  template<typename T>
+  struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -2639,114 +2324,96 @@ struct simd16x32 {
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1)
-        : chunks { chunk0, chunk1 }
-    {
-    }
-    simdutf_really_inline simd16x32(const T* ptr)
-        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)) }
-    {
-    }
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
+    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+    simdutf_really_inline void store(T* ptr) const {
+      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
+      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
-        uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-        uint64_t r_hi = this->chunks[1].to_bitmask();
-        return r_lo | (r_hi << 32);
+    simdutf_really_inline uint64_t to_bitmask() const { // combines the two chunks' bitmasks into one 64-bit value
+      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); // uint32_t cast: widening to 64 bits zero-extends
+      uint64_t r_hi =                       this->chunks[1].to_bitmask();
+      return r_lo | (r_hi << 32); // chunk 0 in bits 0..31, chunk 1 in bits 32..63
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const
-    {
-        return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd16<T> reduce_or() const {
+      return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const
-    {
-        return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const {
+      return this->reduce_or().is_ascii();
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
-        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
+      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
     }
 
-    simdutf_really_inline simd16x32<T> bit_or(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<T>(
-            this->chunks[0] | mask,
-            this->chunks[1] | mask);
+    simdutf_really_inline simd16x32<T> bit_or(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return simd16x32<T>(
+        this->chunks[0] | mask,
+        this->chunks[1] | mask
+      );
     }
 
-    simdutf_really_inline void swap_bytes()
-    {
-        this->chunks[0] = this->chunks[0].swap_bytes();
-        this->chunks[1] = this->chunks[1].swap_bytes();
+    simdutf_really_inline void swap_bytes() {
+      this->chunks[0] = this->chunks[0].swap_bytes();
+      this->chunks[1] = this->chunks[1].swap_bytes();
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t eq(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return  simd16x32<bool>(
+        this->chunks[0] == mask,
+        this->chunks[1] == mask
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
-    {
-        return simd16x32<bool>(
-            this->chunks[0] == other.chunks[0],
-            this->chunks[1] == other.chunks[1])
-            .to_bitmask();
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
+      return  simd16x32<bool>(
+        this->chunks[0] == other.chunks[0],
+        this->chunks[1] == other.chunks[1]
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t lteq(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return  simd16x32<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask
+      ).to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(low);
-        const simd16<T> mask_high = simd16<T>::splat(high);
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd16<T> mask_low = simd16<T>::splat(low);
+      const simd16<T> mask_high = simd16<T>::splat(high);
 
-        return simd16x32<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
+      // Only NUM_CHUNKS == 2 registers exist in this kernel, so test both
+      // chunks against [low, high] and fold the results into one bitmask.
+      return  simd16x32<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
+      ).to_bitmask();
     }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
-        const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
-        return simd16x32<bool>(
-            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask)
-            .to_bitmask();
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // to_bitmask() of (value >= high+1) | (value <= low-1), per chunk
+      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1)); // NOTE(review): low-1 is narrowed back to T; presumably wraps for unsigned T when low == 0 — confirm callers avoid that
+      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1)); // NOTE(review): high+1 is narrowed back to T; presumably wraps when high is T's maximum — confirm
+      return simd16x32<bool>(
+        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return  simd16x32<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask
+      ).to_bitmask();
     }
-}; // struct simd16x32<T>
+  }; // struct simd16x32<T>
 /* end file src/simdutf/haswell/simd16-inl.h */
 
 } // namespace simd
@@ -2766,6 +2433,7 @@ struct simd16x32 {
 SIMDUTF_UNTARGET_REGION
 #endif
 
+
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -2783,6 +2451,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "westmere.h must be included before fallback.h"
 #endif
 
+
 // Default Westmere to on if this is x86-64, unless we'll always select Haswell.
 #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
 //
@@ -2819,6 +2488,7 @@ namespace westmere {
 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
 
+
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace westmere {
@@ -2829,85 +2499,82 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
+  simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32( size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
 };
 
 } // namespace westmere
@@ -2935,6 +2602,7 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
+
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -2942,6 +2610,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif // SIMDUTF_VISUAL_STUDIO
 
+
 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
 /**
  * You are not supposed, normally, to include these
@@ -2951,9 +2620,11 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * only get included *if* the corresponding features are detected
  * from macros:
  */
-#include <smmintrin.h> // for _mm_alignr_epi8
+#include <smmintrin.h>  // for _mm_alignr_epi8
 #endif
 
+
+
 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
 /* end file src/simdutf/westmere/intrinsics.h */
 
@@ -2983,15 +2654,13 @@ namespace westmere {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
-{
-    // note: we do not support legacy 32-bit Windows
-    return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt64(input_num);// Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num)
-{
-    return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+  return _popcnt64(input_num);
 }
 #endif
 
@@ -3011,155 +2680,110 @@ namespace westmere {
 namespace {
 namespace simd {
 
-template<typename Child>
-struct base {
+  template<typename Child>
+  struct base {
     __m128i value;
 
     // Zero constructor
-    simdutf_really_inline base()
-        : value { __m128i() }
-    {
-    }
+    simdutf_really_inline base() : value{__m128i()} {}
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m128i _value)
-        : value(_value)
-    {
-    }
+    simdutf_really_inline base(const __m128i _value) : value(_value) {}
     // Conversion to SIMD register
     simdutf_really_inline operator const __m128i&() const { return this->value; }
     simdutf_really_inline operator __m128i&() { return this->value; }
-    template<endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const
-    {
-        __m128i first = _mm_cvtepu8_epi16(*this);
-        __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8));
-        if (big_endian) {
-            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            first = _mm_shuffle_epi8(first, swap);
-            second = _mm_shuffle_epi8(second, swap);
-        }
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), first);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const
-    {
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), _mm_cvtepu8_epi32(*this));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 4), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4)));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8)));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 12), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12)));
+    template <endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
+      __m128i first = _mm_cvtepu8_epi16(*this);
+      __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
+      if (big_endian) {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        first = _mm_shuffle_epi8(first, swap);
+        second = _mm_shuffle_epi8(second, swap);
+      }
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast | other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator&=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast & other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator^=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast ^ other;
-        return *this_cast;
-    }
-};
+    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
+    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
+    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+  };
 
-// Forward-declared so they can be used by splat and friends.
-template<typename T>
-struct simd8;
+  // Forward-declared so they can be used by splat and friends.
+  template<typename T>
+  struct simd8;
 
-template<typename T, typename Mask = simd8<bool>>
-struct base8 : base<simd8<T>> {
+  template<typename T, typename Mask=simd8<bool>>
+  struct base8: base<simd8<T>> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); }
-    simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); }
-    simdutf_really_inline base8()
-        : base<simd8<T>>()
-    {
-    }
-    simdutf_really_inline base8(const __m128i _value)
-        : base<simd8<T>>(_value)
-    {
-    }
+    simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
+    simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
+    simdutf_really_inline base8() : base<simd8<T>>() {}
+    simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
 
     friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<simd8<T>>::value);
 
-    template<int N = 1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
-    {
-        return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+    template<int N=1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+      return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
     }
-};
+  };
 
-// SIMD byte mask type (returned by things like eq and gt)
-template<>
-struct simd8<bool> : base8<bool> {
+  // SIMD byte mask type (returned by things like eq and gt)
+  template<>
+  struct simd8<bool>: base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>()
-        : base8()
-    {
-    }
-    simdutf_really_inline simd8<bool>(const __m128i _value)
-        : base8<bool>(_value)
-    {
-    }
+    simdutf_really_inline simd8<bool>() : base8() {}
+    simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value)
-        : base8<bool>(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
 
     simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
     simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
     simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
     simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-};
+  };
 
-template<typename T>
-struct base8_numeric : base8<T> {
+  template<typename T>
+  struct base8_numeric: base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
-    static simdutf_really_inline simd8<T> load(const T values[16])
-    {
-        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
+    static simdutf_really_inline simd8<T> load(const T values[16]) {
+      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
-        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
-    {
-        return simd8<T>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
+      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
+    ) {
+      return simd8<T>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
-    simdutf_really_inline base8_numeric()
-        : base8<T>()
-    {
-    }
-    simdutf_really_inline base8_numeric(const __m128i _value)
-        : base8<T>(_value)
-    {
-    }
+    simdutf_really_inline base8_numeric() : base8<T>() {}
+    simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
 
     // Store to array
-    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
+    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
@@ -3167,77 +2791,56 @@ struct base8_numeric : base8<T> {
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd8<T>*>(this);
-    }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd8<T>*>(this);
-    }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
-    {
-        return _mm_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+      return _mm_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0, L replace1, L replace2, L replace3,
-        L replace4, L replace5, L replace6, L replace7,
-        L replace8, L replace9, L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const
-    {
-        return lookup_16(simd8<L>::repeat_16(
-            replace0, replace1, replace2, replace3,
-            replace4, replace5, replace6, replace7,
-            replace8, replace9, replace10, replace11,
-            replace12, replace13, replace14, replace15));
-    }
-};
-
-// Signed bytes
-template<>
-struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<int8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m128i _value)
-        : base8_numeric<int8_t>(_value)
-    {
-    }
+        L replace0,  L replace1,  L replace2,  L replace3,
+        L replace4,  L replace5,  L replace6,  L replace7,
+        L replace8,  L replace9,  L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const {
+      return lookup_16(simd8<L>::repeat_16(
+        replace0,  replace1,  replace2,  replace3,
+        replace4,  replace5,  replace6,  replace7,
+        replace8,  replace9,  replace10, replace11,
+        replace12, replace13, replace14, replace15
+      ));
+    }
+  };
+
+  // Signed bytes
+  template<>
+  struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values)
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
     simdutf_really_inline simd8(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-        : simd8(_mm_setr_epi8(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15))
-    {
-    }
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(_mm_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-    {
-        return simd8<int8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) {
+      return simd8<int8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
     simdutf_really_inline operator simd8<uint8_t>() const;
     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
@@ -3247,47 +2850,35 @@ struct simd8<int8_t> : base8_numeric<int8_t> {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
-};
+  };
 
-// Unsigned bytes
-template<>
-struct simd8<uint8_t> : base8_numeric<uint8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<uint8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m128i _value)
-        : base8_numeric<uint8_t>(_value)
-    {
-    }
+  // Unsigned bytes
+  template<>
+  struct simd8<uint8_t>: base8_numeric<uint8_t>  {
+    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
 
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t* values)
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
     simdutf_really_inline simd8(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-        : simd8(_mm_setr_epi8(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15))
-    {
-    }
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(_mm_setr_epi8(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10,v11,v12,v13,v14,v15
+    )) {}
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
-        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-    {
-        return simd8<uint8_t>(
-            v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14, v15);
+      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) {
+      return simd8<uint8_t>(
+        v0, v1, v2, v3, v4, v5, v6, v7,
+        v8, v9, v10,v11,v12,v13,v14,v15
+      );
     }
 
     // Saturated math
@@ -3324,44 +2915,30 @@ struct simd8<uint8_t> : base8_numeric<uint8_t> {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
-};
-simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
+    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
+  };
+  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-// Unsigned bytes
-template<>
-struct simd8<uint16_t> : base<uint16_t> {
+  // Unsigned 16-bit words
+  template<>
+  struct simd8<uint16_t>: base<uint16_t> {
     static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
-    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8])
-    {
-        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
+    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
+      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
     }
 
-    simdutf_really_inline simd8()
-        : base<uint16_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m128i _value)
-        : base<uint16_t>(_value)
-    {
-    }
+    simdutf_really_inline simd8() : base<uint16_t>() {}
+    simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
     // Splat constructor
-    simdutf_really_inline simd8(uint16_t _value)
-        : simd8(splat(_value))
-    {
-    }
+    simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
     // Array constructor
-    simdutf_really_inline simd8(const uint16_t* values)
-        : simd8(load(values))
-    {
-    }
+    simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
     simdutf_really_inline simd8(
-        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
-        : simd8(_mm_setr_epi16(
-            v0, v1, v2, v3, v4, v5, v6, v7))
-    {
-    }
+      uint16_t v0,  uint16_t v1,  uint16_t v2,  uint16_t v3,  uint16_t v4,  uint16_t v5,  uint16_t v6,  uint16_t v7
+    ) : simd8(_mm_setr_epi16(
+      v0, v1, v2, v3, v4, v5, v6, v7
+    )) {}
 
     // Saturated math
     simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
@@ -3388,9 +2965,9 @@ struct simd8<uint16_t> : base<uint16_t> {
     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-};
-template<typename T>
-struct simd8x64 {
+     };
+  template<typename T>
+  struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -3399,395 +2976,303 @@ struct simd8x64 {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
-        : chunks { chunk0, chunk1, chunk2, chunk3 }
-    {
-    }
-    simdutf_really_inline simd8x64(const T* ptr)
-        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
-    {
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
+
+    simdutf_really_inline void store(T* ptr) const {
+      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
+      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
+      this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
+      this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
     }
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
-        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
-        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
-    }
-
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
-    {
-        this->chunks[0] |= other.chunks[0];
-        this->chunks[1] |= other.chunks[1];
-        this->chunks[2] |= other.chunks[2];
-        this->chunks[3] |= other.chunks[3];
-        return *this;
-    }
-
-    simdutf_really_inline simd8<T> reduce_or() const
-    {
-        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const
-    {
-        return this->reduce_or().is_ascii();
-    }
-
-    template<endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
-        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
-        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
-    }
-
-    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
-        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
-        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
-        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
-        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
-        uint64_t r1 = this->chunks[1].to_bitmask();
-        uint64_t r2 = this->chunks[2].to_bitmask();
-        uint64_t r3 = this->chunks[3].to_bitmask();
-        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask,
-            this->chunks[2] == mask,
-            this->chunks[3] == mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
-    {
-        return simd8x64<bool>(
-            this->chunks[0] == other.chunks[0],
-            this->chunks[1] == other.chunks[1],
-            this->chunks[2] == other.chunks[2],
-            this->chunks[3] == other.chunks[3])
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask,
-            this->chunks[2] <= mask,
-            this->chunks[3] <= mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-
-        return simd8x64<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low - 1);
-        const simd8<T> mask_high = simd8<T>::splat(high + 1);
-        return simd8x64<bool>(
-            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask,
-            this->chunks[2] < mask,
-            this->chunks[3] < mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t gt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] > mask,
-            this->chunks[1] > mask,
-            this->chunks[2] > mask,
-            this->chunks[3] > mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] >= mask,
-            this->chunks[1] >= mask,
-            this->chunks[2] >= mask,
-            this->chunks[3] >= mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
-    {
-        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-        return simd8x64<bool>(
-            simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
-            simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
-            simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
-            simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
-            .to_bitmask();
-    }
-}; // struct simd8x64<T>
+    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
+      this->chunks[0] |= other.chunks[0];
+      this->chunks[1] |= other.chunks[1];
+      this->chunks[2] |= other.chunks[2];
+      this->chunks[3] |= other.chunks[3];
+      return *this;
+    }
+
+    simdutf_really_inline simd8<T> reduce_or() const {
+      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    }
+
+    simdutf_really_inline bool is_ascii() const {
+      return this->reduce_or().is_ascii();
+    }
+
+    template <endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
+      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
+      this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
+      this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
+    }
+
+    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
+      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
+      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
+      this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
+      this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const {
+      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
+      uint64_t r1 =          this->chunks[1].to_bitmask() ;
+      uint64_t r2 =          this->chunks[2].to_bitmask() ;
+      uint64_t r3 =          this->chunks[3].to_bitmask() ;
+      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] == mask,
+        this->chunks[1] == mask,
+        this->chunks[2] == mask,
+        this->chunks[3] == mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+      return  simd8x64<bool>(
+        this->chunks[0] == other.chunks[0],
+        this->chunks[1] == other.chunks[1],
+        this->chunks[2] == other.chunks[2],
+        this->chunks[3] == other.chunks[3]
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask,
+        this->chunks[2] <= mask,
+        this->chunks[3] <= mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+
+      return  simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low-1);
+      const simd8<T> mask_high = simd8<T>::splat(high+1);
+      return simd8x64<bool>(
+        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask,
+        this->chunks[2] < mask,
+        this->chunks[3] < mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask,
+        this->chunks[2] > mask,
+        this->chunks[3] > mask
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] >= mask,
+        this->chunks[1] >= mask,
+        this->chunks[2] >= mask,
+        this->chunks[3] >= mask
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+      return  simd8x64<bool>(
+        simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
+        simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
+        simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
+        simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
+      ).to_bitmask();
+    }
+  }; // struct simd8x64<T>
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/simd16-inl.h
 /* begin file src/simdutf/westmere/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask = simd16<bool>>
-struct base16 : base<simd16<T>> {
-    typedef uint16_t bitmask_t;
-    typedef uint32_t bitmask2_t;
+template<typename T, typename Mask=simd16<bool>>
+struct base16: base<simd16<T>> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline base16()
-        : base<simd16<T>>()
-    {
-    }
-    simdutf_really_inline base16(const __m128i _value)
-        : base<simd16<T>>(_value)
-    {
-    }
-    template<typename Pointer>
-    simdutf_really_inline base16(const Pointer* ptr)
-        : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)))
-    {
-    }
+  simdutf_really_inline base16() : base<simd16<T>>() {}
+  simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
 
-    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
+  friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
 
-    static const int SIZE = sizeof(base<simd16<T>>::value);
+  static const int SIZE = sizeof(base<simd16<T>>::value);
 
-    template<int N = 1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
-    {
-        return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
-    }
+  template<int N=1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+  }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool> : base16<bool> {
-    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool>: base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
 
-    simdutf_really_inline simd16<bool>()
-        : base16()
-    {
-    }
-    simdutf_really_inline simd16<bool>(const __m128i _value)
-        : base16<bool>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd16<bool>(bool _value)
-        : base16<bool>(splat(_value))
-    {
-    }
+  simdutf_really_inline simd16<bool>() : base16() {}
+  simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
 
-    simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
-    simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
-    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+  simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
+  simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
+  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
 };
 
 template<typename T>
-struct base16_numeric : base16<T> {
-    static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
-    static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
-    static simdutf_really_inline simd16<T> load(const T values[8])
-    {
-        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
-    }
-
-    simdutf_really_inline base16_numeric()
-        : base16<T>()
-    {
-    }
-    simdutf_really_inline base16_numeric(const __m128i _value)
-        : base16<T>(_value)
-    {
-    }
-
-    // Store to array
-    simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
-
-    // Override to distinguish from bool version
-    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-    // Addition/subtraction are the same for signed and unsigned
-    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
-    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
-    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd16<T>*>(this);
-    }
-    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd16<T>*>(this);
-    }
+struct base16_numeric: base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
+  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
+  static simdutf_really_inline simd16<T> load(const T values[8]) {
+    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+  }
+
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
+
+  // Override to distinguish from bool version
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ static_cast<T>(-1); } // XOR with all-ones in each 16-bit lane so every bit is inverted, not just the low byte
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
+  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
+  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<int16_t>()
-    {
-    }
-    simdutf_really_inline simd16(const __m128i _value)
-        : base16_numeric<int16_t>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd16(int16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const int16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const int16_t*>(values)))
-    {
-    }
-    // Member-by-member initialization
-    simdutf_really_inline simd16(
-        int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
-        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
-    {
-    }
-    simdutf_really_inline operator simd16<uint16_t>() const;
-
-    // Order-sensitive comparisons
-    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
-    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
-    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
+  // Member-by-member initialization
+  simdutf_really_inline simd16(
+    int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+    : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+  simdutf_really_inline operator simd16<uint16_t>() const;
+
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
+  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t> : base16_numeric<uint16_t> {
-    simdutf_really_inline simd16()
-        : base16_numeric<uint16_t>()
-    {
-    }
-    simdutf_really_inline simd16(const __m128i _value)
-        : base16_numeric<uint16_t>(_value)
-    {
-    }
-
-    // Splat constructor
-    simdutf_really_inline simd16(uint16_t _value)
-        : simd16(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd16(const uint16_t* values)
-        : simd16(load(values))
-    {
-    }
-    simdutf_really_inline simd16(const char16_t* values)
-        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
-    {
-    }
-    // Member-by-member initialization
-    simdutf_really_inline simd16(
-        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
-        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
-    {
-    }
-    // Repeat 16 values as many times as necessary (usually for lookup tables)
-    simdutf_really_inline static simd16<uint16_t> repeat_16(
-        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
-    {
-        return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
-    }
-
-    // Saturated math
-    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
-
-    // Order-specific operations
-    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
-    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
-    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-    // Bit-specific operations
-    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
-    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
-    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
-    template<int N>
-    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
-    // Get one of the bits and make a bitmask out of it.
-    // e.g. value.get_bit<7>() gets the high bit
-    template<int N>
-    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
-
-    // Change the endianness
-    simdutf_really_inline simd16<uint16_t> swap_bytes() const
-    {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        return _mm_shuffle_epi8(*this, swap);
-    }
+struct simd16<uint16_t>: base16_numeric<uint16_t>  {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
+  // Member-by-member initialization
+  simdutf_really_inline simd16(
+    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+  : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+  // Repeat 16 bytes (eight uint16_t values) as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd16<uint16_t> repeat_16(
+    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
+  ) {
+    return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
+  }
+
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
+
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // lt_bits (other - *this, saturated) is nonzero exactly where *this < other
+
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
+  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
+  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
+  template<int N>
+  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
+  // Get one of the bits and make a bitmask out of it.
+  // e.g. value.get_bit<7>() gets the high bit
+  template<int N>
+  simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
+
+  // Change the endianness
+  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    return _mm_shuffle_epi8(*this, swap);
+  }
 
-    // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
-    {
-        return _mm_packus_epi16(v0, v1);
-    }
+  // Pack two uint16_t vectors into a single uint8_t vector, with unsigned saturation
+  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
+    return _mm_packus_epi16(v0, v1);
+  }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
 template<typename T>
-struct simd16x32 {
+  struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -3796,124 +3281,106 @@ struct simd16x32 {
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
-        : chunks { chunk0, chunk1, chunk2, chunk3 }
-    {
-    }
-    simdutf_really_inline simd16x32(const T* ptr)
-        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
-    {
-    }
-
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
-        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
-        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
-    }
-
-    simdutf_really_inline simd16<T> reduce_or() const
-    {
-        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const
-    {
-        return this->reduce_or().is_ascii();
-    }
-
-    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
-    {
-        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
-        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
-        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
-        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
-        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
-        uint64_t r1 = this->chunks[1].to_bitmask();
-        uint64_t r2 = this->chunks[2].to_bitmask();
-        uint64_t r3 = this->chunks[3].to_bitmask();
-        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline void swap_bytes()
-    {
-        this->chunks[0] = this->chunks[0].swap_bytes();
-        this->chunks[1] = this->chunks[1].swap_bytes();
-        this->chunks[2] = this->chunks[2].swap_bytes();
-        this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] == mask,
-            this->chunks[1] == mask,
-            this->chunks[2] == mask,
-            this->chunks[3] == mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
-    {
-        return simd16x32<bool>(
-            this->chunks[0] == other.chunks[0],
-            this->chunks[1] == other.chunks[1],
-            this->chunks[2] == other.chunks[2],
-            this->chunks[3] == other.chunks[3])
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] <= mask,
-            this->chunks[1] <= mask,
-            this->chunks[2] <= mask,
-            this->chunks[3] <= mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(low);
-        const simd16<T> mask_high = simd16<T>::splat(high);
-
-        return simd16x32<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
-        const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
-        return simd16x32<bool>(
-            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd16<T> mask = simd16<T>::splat(m);
-        return simd16x32<bool>(
-            this->chunks[0] < mask,
-            this->chunks[1] < mask,
-            this->chunks[2] < mask,
-            this->chunks[3] < mask)
-            .to_bitmask();
-    }
-}; // struct simd16x32<T>
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} // build from four already-constructed chunks, kept in argument order
+    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {} // load four chunks from ptr; consecutive chunks are sizeof(simd16<T>)/sizeof(T) elements apart
+
+    simdutf_really_inline void store(T* ptr) const { // write the four chunks to ptr, one after another
+      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T)); // offsets count elements of T: chunk size in bytes divided by sizeof(T)
+      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
+      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
+      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
+    }
+
+    simdutf_really_inline simd16<T> reduce_or() const { // fold the four chunks into one vector with operator|
+      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); // paired so the two inner ORs do not depend on each other
+    }
+
+    simdutf_really_inline bool is_ascii() const { // one is_ascii() test on the OR of all four chunks instead of four separate tests
+      return this->reduce_or().is_ascii();
+    }
+
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { // forwards each chunk to its own store_ascii_as_utf16
+      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0); // NOTE(review): the stride is sizeof(simd16<T>) char16_t elements, without the /sizeof(T) that store() applies - confirm this is intended
+      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
+      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
+      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const { // merge the four per-chunk masks into one 64-bit value
+      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); // only r0 goes through uint32_t before widening - presumably to avoid sign extension; confirm
+      uint64_t r1 =          this->chunks[1].to_bitmask() ;
+      uint64_t r2 =          this->chunks[2].to_bitmask() ;
+      uint64_t r3 =          this->chunks[3].to_bitmask() ;
+      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); // the mask of chunk k is shifted left by 16*k bits
+    }
+
+    simdutf_really_inline void swap_bytes() { // replace every chunk with the result of its own swap_bytes(); modifies this object
+      this->chunks[0] = this->chunks[0].swap_bytes();
+      this->chunks[1] = this->chunks[1].swap_bytes();
+      this->chunks[2] = this->chunks[2].swap_bytes();
+      this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const { // 64-bit mask from comparing every chunk with m using ==
+      const simd16<T> mask = simd16<T>::splat(m); // m broadcast to a whole vector
+      return  simd16x32<bool>(
+        this->chunks[0] == mask,
+        this->chunks[1] == mask,
+        this->chunks[2] == mask,
+        this->chunks[3] == mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const { // 64-bit mask from comparing each chunk with the chunk of other at the same index
+      return  simd16x32<bool>(
+        this->chunks[0] == other.chunks[0],
+        this->chunks[1] == other.chunks[1],
+        this->chunks[2] == other.chunks[2],
+        this->chunks[3] == other.chunks[3]
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const { // 64-bit mask from comparing every chunk with m using <=
+      const simd16<T> mask = simd16<T>::splat(m); // m broadcast to a whole vector
+      return  simd16x32<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask,
+        this->chunks[2] <= mask,
+        this->chunks[3] <= mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const { // 64-bit mask of the test low <= value <= high (both bounds inclusive)
+      const simd16<T> mask_low = simd16<T>::splat(low);
+      const simd16<T> mask_high = simd16<T>::splat(high);
+
+      return  simd16x32<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // 64-bit mask of the test value <= low-1 or value >= high+1
+      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1)); // NOTE(review): the cast truncates to T, so for unsigned T a low of 0 wraps to the largest value - confirm callers avoid it
+      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1)); // NOTE(review): likewise wraps when high is the largest value of T
+      return simd16x32<bool>(
+        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const { // 64-bit mask from comparing every chunk with m using <
+      const simd16<T> mask = simd16<T>::splat(m); // m broadcast to a whole vector
+      return  simd16x32<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask,
+        this->chunks[2] < mask,
+        this->chunks[3] < mask
+      ).to_bitmask();
+    }
+  }; // struct simd16x32<T>
 /* end file src/simdutf/westmere/simd16-inl.h */
 
 } // namespace simd
@@ -3946,10 +3413,13 @@ SIMDUTF_UNTARGET_REGION
 #error "ppc64.h must be included before fallback.h"
 #endif
 
+
 #ifndef SIMDUTF_IMPLEMENTATION_PPC64
 #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64&& SIMDUTF_IS_PPC64
+#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64
+
+
 
 #if SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -3966,6 +3436,7 @@ namespace ppc64 {
 #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
 #define SIMDUTF_PPC64_IMPLEMENTATION_H
 
+
 namespace simdutf {
 namespace ppc64 {
 
@@ -3975,64 +3446,62 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
-            internal::instruction_set::ALTIVEC)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+  simdutf_really_inline implementation() // forwards "ppc64", "PPC64 ALTIVEC" and the ALTIVEC instruction-set value to the simdutf::implementation base constructor
+      : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
+                                 internal::instruction_set::ALTIVEC) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final; // returns void, so it carries no simdutf_warn_unused
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept; // NOTE(review): this and the remaining declarations are not marked final, unlike those above - confirm that is intentional
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
 };
 
 } // namespace ppc64
@@ -4053,6 +3522,7 @@ public:
 #ifndef SIMDUTF_PPC64_INTRINSICS_H
 #define SIMDUTF_PPC64_INTRINSICS_H
 
+
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <altivec.h>
@@ -4078,15 +3548,13 @@ namespace ppc64 {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline int count_ones(uint64_t input_num)
-{
-    // note: we do not support legacy 32-bit Windows
-    return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline int count_ones(uint64_t input_num) { // number of 1 bits in input_num (Visual Studio branch)
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline int count_ones(uint64_t input_num)
-{
-    return __builtin_popcountll(input_num);
+simdutf_really_inline int count_ones(uint64_t input_num) { // number of 1 bits in input_num (all other compilers)
+  return __builtin_popcountll(input_num); // compiler builtin population count on an unsigned long long
 }
 #endif
 
@@ -4110,592 +3578,474 @@ namespace simd {
 
 using __m128i = __vector unsigned char;
 
-template<typename Child> struct base {
-    __m128i value;
-
-    // Zero constructor
-    simdutf_really_inline base()
-        : value { __m128i() }
-    {
-    }
-
-    // Conversion from SIMD register
-    simdutf_really_inline base(const __m128i _value)
-        : value(_value)
-    {
-    }
-
-    // Conversion to SIMD register
-    simdutf_really_inline operator const __m128i&() const
-    {
-        return this->value;
-    }
-    simdutf_really_inline operator __m128i&() { return this->value; }
-
-    // Bit operations
-    simdutf_really_inline Child operator|(const Child other) const
-    {
-        return vec_or(this->value, (__m128i)other);
-    }
-    simdutf_really_inline Child operator&(const Child other) const
-    {
-        return vec_and(this->value, (__m128i)other);
-    }
-    simdutf_really_inline Child operator^(const Child other) const
-    {
-        return vec_xor(this->value, (__m128i)other);
-    }
-    simdutf_really_inline Child bit_andnot(const Child other) const
-    {
-        return vec_andc(this->value, (__m128i)other);
-    }
-    simdutf_really_inline Child& operator|=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast | other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator&=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast & other;
-        return *this_cast;
-    }
-    simdutf_really_inline Child& operator^=(const Child other)
-    {
-        auto this_cast = static_cast<Child*>(this);
-        *this_cast = *this_cast ^ other;
-        return *this_cast;
-    }
+template <typename Child> struct base { // CRTP base: Child is the concrete vector type that the operators below return
+  __m128i value; // the wrapped 128-bit vector
+
+  // Zero constructor
+  simdutf_really_inline base() : value{__m128i()} {}
+
+  // Conversion from SIMD register
+  simdutf_really_inline base(const __m128i _value) : value(_value) {}
+
+  // Conversion to SIMD register
+  simdutf_really_inline operator const __m128i &() const {
+    return this->value;
+  }
+  simdutf_really_inline operator __m128i &() { return this->value; }
+
+  // Bit operations
+  simdutf_really_inline Child operator|(const Child other) const {
+    return vec_or(this->value, (__m128i)other);
+  }
+  simdutf_really_inline Child operator&(const Child other) const {
+    return vec_and(this->value, (__m128i)other);
+  }
+  simdutf_really_inline Child operator^(const Child other) const {
+    return vec_xor(this->value, (__m128i)other);
+  }
+  simdutf_really_inline Child bit_andnot(const Child other) const {
+    return vec_andc(this->value, (__m128i)other); // this AND (NOT other)
+  }
+  simdutf_really_inline Child &operator|=(const Child other) {
+    auto this_cast = static_cast<Child*>(this); // view this object as the derived Child so the Child-typed result can be assigned back
+    *this_cast = *this_cast | other;
+    return *this_cast;
+  }
+  simdutf_really_inline Child &operator&=(const Child other) {
+    auto this_cast = static_cast<Child*>(this); // same downcast as in operator|=
+    *this_cast = *this_cast & other;
+    return *this_cast;
+  }
+  simdutf_really_inline Child &operator^=(const Child other) {
+    auto this_cast = static_cast<Child*>(this); // same downcast as in operator|=
+    *this_cast = *this_cast ^ other;
+    return *this_cast;
+  }
 };
 
 // Forward-declared so they can be used by splat and friends.
-template<typename T> struct simd8;
+template <typename T> struct simd8;
 
-template<typename T, typename Mask = simd8<bool>>
+template <typename T, typename Mask = simd8<bool>> // Mask is the type returned by comparisons; it defaults to simd8<bool>
 struct base8 : base<simd8<T>> {
-    typedef uint16_t bitmask_t;
-    typedef uint32_t bitmask2_t;
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline base8()
-        : base<simd8<T>>()
-    {
-    }
-    simdutf_really_inline base8(const __m128i _value)
-        : base<simd8<T>>(_value)
-    {
-    }
+  simdutf_really_inline base8() : base<simd8<T>>() {}
+  simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
 
-    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs)
-    {
-        return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
-    }
+  friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { // element-wise equality, returned as a Mask
+    return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
+  }
 
-    static const int SIZE = sizeof(base<simd8<T>>::value);
+  static const int SIZE = sizeof(base<simd8<T>>::value); // size in bytes of the wrapped vector
 
-    template<int N = 1>
-    simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const
-    {
-        __m128i chunk = this->value;
+  template <int N = 1>
+  simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const { // combines prev_chunk and this vector through vec_sld with a shift of 16 - N bytes; presumably each byte ends up N positions later, filled from the end of prev_chunk - confirm
+    __m128i chunk = this->value;
 #ifdef __LITTLE_ENDIAN__
-        chunk = (__m128i)vec_reve(this->value);
-        prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
+    chunk = (__m128i)vec_reve(this->value); // on little-endian both inputs have their element order reversed before the shift
+    prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
 #endif
-        chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
+    chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
 #ifdef __LITTLE_ENDIAN__
-        chunk = (__m128i)vec_reve((__m128i)chunk);
+    chunk = (__m128i)vec_reve((__m128i)chunk); // undo the reversal applied before the shift
 #endif
-        return chunk;
-    }
+    return chunk;
+  }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
-template<> struct simd8<bool> : base8<bool> {
-    static simdutf_really_inline simd8<bool> splat(bool _value)
-    {
-        return (__m128i)vec_splats((unsigned char)(-(!!_value)));
-    }
-
-    simdutf_really_inline simd8<bool>()
-        : base8()
-    {
-    }
-    simdutf_really_inline simd8<bool>(const __m128i _value)
-        : base8<bool>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value)
-        : base8<bool>(splat(_value))
-    {
-    }
-
-    simdutf_really_inline int to_bitmask() const
-    {
-        __vector unsigned long long result;
-        const __m128i perm_mask = { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
-            0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 };
-
-        result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
-            (__m128i)perm_mask));
+template <> struct simd8<bool> : base8<bool> {
+  static simdutf_really_inline simd8<bool> splat(bool _value) { // every byte becomes 0xFF when _value is true and 0x00 when it is false
+    return (__m128i)vec_splats((unsigned char)(-(!!_value)));
+  }
+
+  simdutf_really_inline simd8<bool>() : base8() {}
+  simdutf_really_inline simd8<bool>(const __m128i _value)
+      : base8<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd8<bool>(bool _value)
+      : base8<bool>(splat(_value)) {}
+
+  simdutf_really_inline int to_bitmask() const { // gather one bit per byte of the mask into an int
+    __vector unsigned long long result;
+    const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, // bit indices handed to vec_vbpermq: multiples of 8, i.e. one bit per byte, in descending order
+                               0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
+
+    result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
+                                                       (__m128i)perm_mask));
 #ifdef __LITTLE_ENDIAN__
-        return static_cast<int>(result[1]);
+    return static_cast<int>(result[1]); // little-endian builds read the gathered bits from element 1
 #else
-        return static_cast<int>(result[0]);
+    return static_cast<int>(result[0]); // other builds read the gathered bits from element 0
 #endif
-    }
-    simdutf_really_inline bool any() const
-    {
-        return !vec_all_eq(this->value, (__m128i)vec_splats(0));
-    }
-    simdutf_really_inline simd8<bool> operator~() const
-    {
-        return this->value ^ (__m128i)splat(true);
-    }
+  }
+  simdutf_really_inline bool any() const { // true unless every element compares equal to zero
+    return !vec_all_eq(this->value, (__m128i)vec_splats(0));
+  }
+  simdutf_really_inline simd8<bool> operator~() const { // flip every bit by XOR-ing with splat(true)
+    return this->value ^ (__m128i)splat(true);
+  }
 };
 
-template<typename T> struct base8_numeric : base8<T> {
-    static simdutf_really_inline simd8<T> splat(T value)
-    {
-        (void)value;
-        return (__m128i)vec_splats(value);
-    }
-    static simdutf_really_inline simd8<T> zero() { return splat(0); }
-    static simdutf_really_inline simd8<T> load(const T values[16])
-    {
-        return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t*>(values)));
-    }
-    // Repeat 16 values as many times as necessary (usually for lookup tables)
-    static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
-        T v5, T v6, T v7, T v8, T v9,
-        T v10, T v11, T v12, T v13,
-        T v14, T v15)
-    {
-        return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-            v14, v15);
-    }
-
-    simdutf_really_inline base8_numeric()
-        : base8<T>()
-    {
-    }
-    simdutf_really_inline base8_numeric(const __m128i _value)
-        : base8<T>(_value)
-    {
-    }
-
-    // Store to array
-    simdutf_really_inline void store(T dst[16]) const
-    {
-        vec_vsx_st(this->value, 0, reinterpret_cast<__m128i*>(dst));
-    }
-
-    // Override to distinguish from bool version
-    simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-
-    // Addition/subtraction are the same for signed and unsigned
-    simdutf_really_inline simd8<T> operator+(const simd8<T> other) const
-    {
-        return (__m128i)((__m128i)this->value + (__m128i)other);
-    }
-    simdutf_really_inline simd8<T> operator-(const simd8<T> other) const
-    {
-        return (__m128i)((__m128i)this->value - (__m128i)other);
-    }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
-    {
-        *this = *this + other;
-        return *static_cast<simd8<T>*>(this);
-    }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
-    {
-        *this = *this - other;
-        return *static_cast<simd8<T>*>(this);
-    }
-
-    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
-    // for out of range values)
-    template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
-    {
-        return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
-    }
-
-    template<typename L>
-    simdutf_really_inline simd8<L>
-    lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
-        L replace5, L replace6, L replace7, L replace8, L replace9,
-        L replace10, L replace11, L replace12, L replace13, L replace14,
-        L replace15) const
-    {
-        return lookup_16(simd8<L>::repeat_16(
-            replace0, replace1, replace2, replace3, replace4, replace5, replace6,
-            replace7, replace8, replace9, replace10, replace11, replace12,
-            replace13, replace14, replace15));
-    }
+template <typename T> struct base8_numeric : base8<T> {
+  static simdutf_really_inline simd8<T> splat(T value) {
+    (void)value;
+    return (__m128i)vec_splats(value);
+  }
+  static simdutf_really_inline simd8<T> zero() { return splat(0); }
+  static simdutf_really_inline simd8<T> load(const T values[16]) {
+    return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
+  }
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+                                                   T v5, T v6, T v7, T v8, T v9,
+                                                   T v10, T v11, T v12, T v13,
+                                                   T v14, T v15) {
+    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+                    v14, v15);
+  }
+
+  simdutf_really_inline base8_numeric() : base8<T>() {}
+  simdutf_really_inline base8_numeric(const __m128i _value)
+      : base8<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[16]) const {
+    vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
+  }
+
+  // Override to distinguish from bool version
+  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+    return (__m128i)((__m128i)this->value + (__m128i)other);
+  }
+  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+    return (__m128i)((__m128i)this->value - (__m128i)other);
+  }
+  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+    *this = *this + other;
+    return *static_cast<simd8<T> *>(this);
+  }
+  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+    *this = *this - other;
+    return *static_cast<simd8<T> *>(this);
+  }
+
+  // Perform a lookup assuming the value is between 0 and 15 (undefined behavior
+  // for out of range values)
+  template <typename L>
+  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+    return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
+  }
+
+  template <typename L>
+  simdutf_really_inline simd8<L>
+  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+            L replace5, L replace6, L replace7, L replace8, L replace9,
+            L replace10, L replace11, L replace12, L replace13, L replace14,
+            L replace15) const {
+    return lookup_16(simd8<L>::repeat_16(
+        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+        replace7, replace8, replace9, replace10, replace11, replace12,
+        replace13, replace14, replace15));
+  }
 };
 
 // Signed bytes
-template<> struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<int8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m128i _value)
-        : base8_numeric<int8_t>(_value)
-    {
-    }
-
-    // Splat constructor
-    simdutf_really_inline simd8(int8_t _value)
-        : simd8(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd8(const int8_t* values)
-        : simd8(load(values))
-    {
-    }
-    // Member-by-member initialization
-    simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
-        int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-        int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-        : simd8((__m128i)(__vector signed char) { v0, v1, v2, v3, v4, v5, v6, v7,
-            v8, v9, v10, v11, v12, v13, v14,
-            v15 })
-    {
-    }
-    // Repeat 16 values as many times as necessary (usually for lookup tables)
-    simdutf_really_inline static simd8<int8_t>
-    repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
-        int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-    {
-        return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-            v13, v14, v15);
-    }
-
-    // Order-sensitive comparisons
-    simdutf_really_inline simd8<int8_t>
-    max_val(const simd8<int8_t> other) const
-    {
-        return (__m128i)vec_max((__vector signed char)this->value,
-            (__vector signed char)(__m128i)other);
-    }
-    simdutf_really_inline simd8<int8_t>
-    min_val(const simd8<int8_t> other) const
-    {
-        return (__m128i)vec_min((__vector signed char)this->value,
-            (__vector signed char)(__m128i)other);
-    }
-    simdutf_really_inline simd8<bool>
-    operator>(const simd8<int8_t> other) const
-    {
-        return (__m128i)vec_cmpgt((__vector signed char)this->value,
-            (__vector signed char)(__m128i)other);
-    }
-    simdutf_really_inline simd8<bool>
-    operator<(const simd8<int8_t> other) const
-    {
-        return (__m128i)vec_cmplt((__vector signed char)this->value,
-            (__vector signed char)(__m128i)other);
-    }
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+  simdutf_really_inline simd8(const __m128i _value)
+      : base8_numeric<int8_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+  // Member-by-member initialization
+  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+                               int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+                               int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+                               int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+      : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
+                                              v8, v9, v10, v11, v12, v13, v14,
+                                              v15}) {}
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<int8_t>
+  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+    return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                         v13, v14, v15);
+  }
+
+  // Order-sensitive comparisons
+  simdutf_really_inline simd8<int8_t>
+  max_val(const simd8<int8_t> other) const {
+    return (__m128i)vec_max((__vector signed char)this->value,
+                            (__vector signed char)(__m128i)other);
+  }
+  simdutf_really_inline simd8<int8_t>
+  min_val(const simd8<int8_t> other) const {
+    return (__m128i)vec_min((__vector signed char)this->value,
+                            (__vector signed char)(__m128i)other);
+  }
+  simdutf_really_inline simd8<bool>
+  operator>(const simd8<int8_t> other) const {
+    return (__m128i)vec_cmpgt((__vector signed char)this->value,
+                              (__vector signed char)(__m128i)other);
+  }
+  simdutf_really_inline simd8<bool>
+  operator<(const simd8<int8_t> other) const {
+    return (__m128i)vec_cmplt((__vector signed char)this->value,
+                              (__vector signed char)(__m128i)other);
+  }
 };
 
 // Unsigned bytes
-template<> struct simd8<uint8_t> : base8_numeric<uint8_t> {
-    simdutf_really_inline simd8()
-        : base8_numeric<uint8_t>()
-    {
-    }
-    simdutf_really_inline simd8(const __m128i _value)
-        : base8_numeric<uint8_t>(_value)
-    {
-    }
-    // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value)
-        : simd8(splat(_value))
-    {
-    }
-    // Array constructor
-    simdutf_really_inline simd8(const uint8_t* values)
-        : simd8(load(values))
-    {
-    }
-    // Member-by-member initialization
-    simdutf_really_inline
-    simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+  simdutf_really_inline simd8(const __m128i _value)
+      : base8_numeric<uint8_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
+  // Member-by-member initialization
+  simdutf_really_inline
+  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
         uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
         uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-        : simd8((__m128i) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-            v13, v14, v15 })
-    {
-    }
-    // Repeat 16 values as many times as necessary (usually for lookup tables)
-    simdutf_really_inline static simd8<uint8_t>
-    repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
-        uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
-        uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
-        uint8_t v15)
-    {
-        return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-            v13, v14, v15);
-    }
-
-    // Saturated math
-    simdutf_really_inline simd8<uint8_t>
-    saturating_add(const simd8<uint8_t> other) const
-    {
-        return (__m128i)vec_adds(this->value, (__m128i)other);
-    }
-    simdutf_really_inline simd8<uint8_t>
-    saturating_sub(const simd8<uint8_t> other) const
-    {
-        return (__m128i)vec_subs(this->value, (__m128i)other);
-    }
-
-    // Order-specific operations
-    simdutf_really_inline simd8<uint8_t>
-    max_val(const simd8<uint8_t> other) const
-    {
-        return (__m128i)vec_max(this->value, (__m128i)other);
-    }
-    simdutf_really_inline simd8<uint8_t>
-    min_val(const simd8<uint8_t> other) const
-    {
-        return (__m128i)vec_min(this->value, (__m128i)other);
-    }
-    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd8<uint8_t>
-    gt_bits(const simd8<uint8_t> other) const
-    {
-        return this->saturating_sub(other);
-    }
-    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-    simdutf_really_inline simd8<uint8_t>
-    lt_bits(const simd8<uint8_t> other) const
-    {
-        return other.saturating_sub(*this);
-    }
-    simdutf_really_inline simd8<bool>
-    operator<=(const simd8<uint8_t> other) const
-    {
-        return other.max_val(*this) == other;
-    }
-    simdutf_really_inline simd8<bool>
-    operator>=(const simd8<uint8_t> other) const
-    {
-        return other.min_val(*this) == other;
-    }
-    simdutf_really_inline simd8<bool>
-    operator>(const simd8<uint8_t> other) const
-    {
-        return this->gt_bits(other).any_bits_set();
-    }
-    simdutf_really_inline simd8<bool>
-    operator<(const simd8<uint8_t> other) const
-    {
-        return this->gt_bits(other).any_bits_set();
-    }
-
-    // Bit-specific operations
-    simdutf_really_inline simd8<bool> bits_not_set() const
-    {
-        return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
-    }
-    simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const
-    {
-        return (*this & bits).bits_not_set();
-    }
-    simdutf_really_inline simd8<bool> any_bits_set() const
-    {
-        return ~this->bits_not_set();
-    }
-    simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const
-    {
-        return ~this->bits_not_set(bits);
-    }
-
-    simdutf_really_inline bool is_ascii() const
-    {
-        return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
-    }
-
-    simdutf_really_inline bool bits_not_set_anywhere() const
-    {
-        return vec_all_eq(this->value, (__m128i)vec_splats(0));
-    }
-    simdutf_really_inline bool any_bits_set_anywhere() const
-    {
-        return !bits_not_set_anywhere();
-    }
-    simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const
-    {
-        return vec_all_eq(vec_and(this->value, (__m128i)bits),
-            (__m128i)vec_splats(0));
-    }
-    simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const
-    {
-        return !bits_not_set_anywhere(bits);
-    }
-    template<int N> simdutf_really_inline simd8<uint8_t> shr() const
-    {
-        return simd8<uint8_t>(
-            (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
-    }
-    template<int N> simdutf_really_inline simd8<uint8_t> shl() const
-    {
-        return simd8<uint8_t>(
-            (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
-    }
+      : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                        v13, v14, v15}) {}
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<uint8_t>
+  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+            uint8_t v15) {
+    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                          v13, v14, v15);
+  }
+
+  // Saturated math
+  simdutf_really_inline simd8<uint8_t>
+  saturating_add(const simd8<uint8_t> other) const {
+    return (__m128i)vec_adds(this->value, (__m128i)other);
+  }
+  simdutf_really_inline simd8<uint8_t>
+  saturating_sub(const simd8<uint8_t> other) const {
+    return (__m128i)vec_subs(this->value, (__m128i)other);
+  }
+
+  // Order-specific operations
+  simdutf_really_inline simd8<uint8_t>
+  max_val(const simd8<uint8_t> other) const {
+    return (__m128i)vec_max(this->value, (__m128i)other);
+  }
+  simdutf_really_inline simd8<uint8_t>
+  min_val(const simd8<uint8_t> other) const {
+    return (__m128i)vec_min(this->value, (__m128i)other);
+  }
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd8<uint8_t>
+  gt_bits(const simd8<uint8_t> other) const {
+    return this->saturating_sub(other);
+  }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd8<uint8_t>
+  lt_bits(const simd8<uint8_t> other) const {
+    return other.saturating_sub(*this);
+  }
+  simdutf_really_inline simd8<bool>
+  operator<=(const simd8<uint8_t> other) const {
+    return other.max_val(*this) == other;
+  }
+  simdutf_really_inline simd8<bool>
+  operator>=(const simd8<uint8_t> other) const {
+    return other.min_val(*this) == other;
+  }
+  simdutf_really_inline simd8<bool>
+  operator>(const simd8<uint8_t> other) const {
+    return this->gt_bits(other).any_bits_set();
+  }
+  // a < b exactly when b - a saturates to nonzero; use lt_bits, not gt_bits
+  simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const {
+    return this->lt_bits(other).any_bits_set();
+  }
+
+  // Bit-specific operations
+  simdutf_really_inline simd8<bool> bits_not_set() const {
+    return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
+  }
+  simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+    return (*this & bits).bits_not_set();
+  }
+  simdutf_really_inline simd8<bool> any_bits_set() const {
+    return ~this->bits_not_set();
+  }
+  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+    return ~this->bits_not_set(bits);
+  }
+
+  simdutf_really_inline bool is_ascii() const {
+      return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
+  }
+
+  simdutf_really_inline bool bits_not_set_anywhere() const {
+    return vec_all_eq(this->value, (__m128i)vec_splats(0));
+  }
+  simdutf_really_inline bool any_bits_set_anywhere() const {
+    return !bits_not_set_anywhere();
+  }
+  simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
+    return vec_all_eq(vec_and(this->value, (__m128i)bits),
+                      (__m128i)vec_splats(0));
+  }
+  simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+    return !bits_not_set_anywhere(bits);
+  }
+  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+    return simd8<uint8_t>(
+        (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
+  }
+  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+    return simd8<uint8_t>(
+        (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
+  }
 };
 
-template<typename T> struct simd8x64 {
-    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
-    static_assert(NUM_CHUNKS == 4,
-        "PPC64 kernel should use four registers per 64-byte block.");
-    simd8<T> chunks[NUM_CHUNKS];
+template <typename T> struct simd8x64 {
+  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+  static_assert(NUM_CHUNKS == 4,
+                "PPC64 kernel should use four registers per 64-byte block.");
+  simd8<T> chunks[NUM_CHUNKS];
 
-    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
-    simd8x64<T>&
-    operator=(const simd8<T> other)
-        = delete; // no assignment allowed
-    simd8x64() = delete; // no default constructor allowed
+  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+  simd8x64<T> &
+  operator=(const simd8<T> other) = delete; // no assignment allowed
+  simd8x64() = delete;                      // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
-        const simd8<T> chunk2, const simd8<T> chunk3)
-        : chunks { chunk0, chunk1, chunk2, chunk3 }
-    {
-    }
+  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+                                  const simd8<T> chunk2, const simd8<T> chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
 
-    simdutf_really_inline simd8x64(const T* ptr)
-        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
-    {
-    }
+  simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
 
-    simdutf_really_inline void store(T* ptr) const
-    {
-        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
-        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
-        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
-        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
-    }
+  simdutf_really_inline void store(T* ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
+    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
+    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
+  }
 
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
-    {
-        this->chunks[0] |= other.chunks[0];
-        this->chunks[1] |= other.chunks[1];
-        this->chunks[2] |= other.chunks[2];
-        this->chunks[3] |= other.chunks[3];
-        return *this;
-    }
 
-    simdutf_really_inline simd8<T> reduce_or() const
-    {
-        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+  simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
+      this->chunks[0] |= other.chunks[0];
+      this->chunks[1] |= other.chunks[1];
+      this->chunks[2] |= other.chunks[2];
+      this->chunks[3] |= other.chunks[3];
+      return *this;
     }
 
-    simdutf_really_inline bool is_ascii() const
-    {
-        return input.reduce_or().is_ascii();
-    }
+  simdutf_really_inline simd8<T> reduce_or() const {
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
+  }
 
-    simdutf_really_inline uint64_t to_bitmask() const
-    {
-        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
-        uint64_t r1 = this->chunks[1].to_bitmask();
-        uint64_t r2 = this->chunks[2].to_bitmask();
-        uint64_t r3 = this->chunks[3].to_bitmask();
-        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
 
-    simdutf_really_inline uint64_t eq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
-            this->chunks[2] == mask, this->chunks[3] == mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
-    {
-        return simd8x64<bool>(this->chunks[0] == other.chunks[0],
-            this->chunks[1] == other.chunks[1],
-            this->chunks[2] == other.chunks[2],
-            this->chunks[3] == other.chunks[3])
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
-            this->chunks[2] <= mask, this->chunks[3] <= mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-
-        return simd8x64<bool>(
-            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
-    {
-        const simd8<T> mask_low = simd8<T>::splat(low);
-        const simd8<T> mask_high = simd8<T>::splat(high);
-        return simd8x64<bool>(
-            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
-            this->chunks[2] < mask, this->chunks[3] < mask)
-            .to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t gt(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] > mask,
-            this->chunks[1] > mask,
-            this->chunks[2] > mask,
-            this->chunks[3] > mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const
-    {
-        const simd8<T> mask = simd8<T>::splat(m);
-        return simd8x64<bool>(
-            this->chunks[0] >= mask,
-            this->chunks[1] >= mask,
-            this->chunks[2] >= mask,
-            this->chunks[3] >= mask)
-            .to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
-    {
-        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-        return simd8x64<bool>(
-            simd8<uint8_t>(this->chunks[0]) >= mask,
-            simd8<uint8_t>(this->chunks[1]) >= mask,
-            simd8<uint8_t>(this->chunks[2]) >= mask,
-            simd8<uint8_t>(this->chunks[3]) >= mask)
-            .to_bitmask();
-    }
+  simdutf_really_inline bool is_ascii() const { // true iff no chunk byte > 0x7F
+    return this->reduce_or().is_ascii(); // was `input.`, an undeclared name
+  }
+
+  simdutf_really_inline uint64_t to_bitmask() const {
+    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+    uint64_t r1 = this->chunks[1].to_bitmask();
+    uint64_t r2 = this->chunks[2].to_bitmask();
+    uint64_t r3 = this->chunks[3].to_bitmask();
+    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+  }
+
+  simdutf_really_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+                          this->chunks[2] == mask, this->chunks[3] == mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+                          this->chunks[1] == other.chunks[1],
+                          this->chunks[2] == other.chunks[2],
+                          this->chunks[3] == other.chunks[3])
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+                          this->chunks[2] <= mask, this->chunks[3] <= mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+
+      return  simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+      ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+      return  simd8x64<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
+      ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t lt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+                          this->chunks[2] < mask, this->chunks[3] < mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t gt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask,
+        this->chunks[2] > mask,
+        this->chunks[3] > mask
+      ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return  simd8x64<bool>(
+        this->chunks[0] >= mask,
+        this->chunks[1] >= mask,
+        this->chunks[2] >= mask,
+        this->chunks[3] >= mask
+      ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+      return  simd8x64<bool>(
+        simd8<uint8_t>(this->chunks[0]) >= mask,
+        simd8<uint8_t>(this->chunks[1]) >= mask,
+        simd8<uint8_t>(this->chunks[2]) >= mask,
+        simd8<uint8_t>(this->chunks[3]) >= mask
+      ).to_bitmask();
+  }
 }; // struct simd8x64<T>
 
 } // namespace simd
@@ -4719,6 +4069,7 @@ template<typename T> struct simd8x64 {
 #ifndef SIMDUTF_FALLBACK_H
 #define SIMDUTF_FALLBACK_H
 
+
 // Note that fallback.h is always imported last.
 
 // Default Fallback to on unless a builtin implementation has already been selected.
@@ -4747,6 +4098,7 @@ namespace fallback {
 #ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
 #define SIMDUTF_FALLBACK_IMPLEMENTATION_H
 
+
 namespace simdutf {
 namespace fallback {
 
@@ -4756,89 +4108,86 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-    simdutf_really_inline implementation()
-        : simdutf::implementation(
-            "fallback",
-            "Generic fallback implementation",
-            0)
-    {
-    }
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
-};
+  simdutf_really_inline implementation() : simdutf::implementation(
+      "fallback",
+      "Generic fallback implementation",
+      0
+  ) {}
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32( size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;};
 
 } // namespace fallback
 } // namespace simdutf
@@ -4865,21 +4214,19 @@ namespace fallback {
 namespace {
 
 #if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
-static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x)
-{
-    unsigned long x0 = (unsigned long)x, top, bottom;
-    _BitScanForward(&top, (unsigned long)(x >> 32));
-    _BitScanForward(&bottom, x0);
-    *ret = x0 ? bottom : 32 + top;
-    return x != 0;
-}
-static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x)
-{
-    unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
-    _BitScanReverse(&top, x1);
-    _BitScanReverse(&bottom, (unsigned long)x);
-    *ret = x1 ? top + 32 : bottom;
-    return x != 0;
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
+  unsigned long x0 = (unsigned long)x, top, bottom;
+  _BitScanForward(&top, (unsigned long)(x >> 32));
+  _BitScanForward(&bottom, x0);
+  *ret = x0 ? bottom : 32 + top;
+  return x != 0;
+}
+static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
+  unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
+  _BitScanReverse(&top, x1);
+  _BitScanReverse(&bottom, (unsigned long)x);
+  *ret = x1 ? top + 32 : bottom;
+  return x != 0;
 }
 #endif
 
@@ -4899,20 +4246,16 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x)
 /* end file src/simdutf/fallback.h */
 
 namespace simdutf {
-bool implementation::supported_by_runtime_system() const
-{
-    uint32_t required_instruction_sets = this->required_instruction_sets();
-    uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-    return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
+bool implementation::supported_by_runtime_system() const {
+  uint32_t required_instruction_sets = this->required_instruction_sets();
+  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
+  return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
 }
 
-simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char* input, size_t length) const noexcept
-{
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
     // If there is a BOM, then we trust it.
     auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
+    if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
     // UTF8 is common, it includes ASCII, and is commonly represented
     // without a BOM, so if it fits, go with that. Note that it is still
     // possible to get it wrong, we are only 'guessing'. If some has UTF-16
@@ -4920,21 +4263,15 @@ simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char
     //
     // An interesting twist might be to check for UTF-16 ASCII first (every
     // other byte is zero).
-    if (validate_utf8(input, length)) {
-        return encoding_type::UTF8;
-    }
+    if(validate_utf8(input, length)) { return encoding_type::UTF8; }
     // The next most common encoding that might appear without BOM is probably
     // UTF-16LE, so try that next.
-    if ((length % 2) == 0) {
-        // important: we need to divide by two
-        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
-            return encoding_type::UTF16_LE;
-        }
+    if((length % 2) == 0) {
+      // important: we need to divide by two
+      if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
     }
-    if ((length % 4) == 0) {
-        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
-            return encoding_type::UTF32_LE;
-        }
+    if((length % 4) == 0) {
+      if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
     }
     return encoding_type::unspecified;
 }
@@ -4944,46 +4281,41 @@ namespace internal {
 // Static array of known implementations. We're hoping these get baked into the executable
 // without requiring a static initializer.
 
+
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-static const icelake::implementation* get_icelake_singleton()
-{
-    static const icelake::implementation icelake_singleton {};
-    return &icelake_singleton;
+static const icelake::implementation* get_icelake_singleton() {
+  static const icelake::implementation icelake_singleton{};
+  return &icelake_singleton;
 }
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-static const haswell::implementation* get_haswell_singleton()
-{
-    static const haswell::implementation haswell_singleton {};
-    return &haswell_singleton;
+static const haswell::implementation* get_haswell_singleton() {
+  static const haswell::implementation haswell_singleton{};
+  return &haswell_singleton;
 }
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-static const westmere::implementation* get_westmere_singleton()
-{
-    static const westmere::implementation westmere_singleton {};
-    return &westmere_singleton;
+static const westmere::implementation* get_westmere_singleton() {
+  static const westmere::implementation westmere_singleton{};
+  return &westmere_singleton;
 }
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-static const arm64::implementation* get_arm64_singleton()
-{
-    static const arm64::implementation arm64_singleton {};
-    return &arm64_singleton;
+static const arm64::implementation* get_arm64_singleton() {
+  static const arm64::implementation arm64_singleton{};
+  return &arm64_singleton;
 }
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-static const ppc64::implementation* get_ppc64_singleton()
-{
-    static const ppc64::implementation ppc64_singleton {};
-    return &ppc64_singleton;
+static const ppc64::implementation* get_ppc64_singleton() {
+  static const ppc64::implementation ppc64_singleton{};
+  return &ppc64_singleton;
 }
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-static const fallback::implementation* get_fallback_singleton()
-{
-    static const fallback::implementation fallback_singleton {};
-    return &fallback_singleton;
+static const fallback::implementation* get_fallback_singleton() {
+  static const fallback::implementation fallback_singleton{};
+  return &fallback_singleton;
 }
 #endif
 
@@ -4992,1266 +4324,1022 @@ static const fallback::implementation* get_fallback_singleton()
  */
 class detect_best_supported_implementation_on_first_use final : public implementation {
 public:
-    const std::string& name() const noexcept final { return set_best()->name(); }
-    const std::string& description() const noexcept final { return set_best()->description(); }
-    uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
+  const std::string &name() const noexcept final { return set_best()->name(); }
+  const std::string &description() const noexcept final { return set_best()->description(); }
+  uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
 
-    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept override
-    {
-        return set_best()->detect_encodings(input, length);
-    }
+  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
+    return set_best()->detect_encodings(input, length);
+  }
 
-    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf8(buf, len);
-    }
+  simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf8(buf, len);
+  }
 
-    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf8_with_errors(buf, len);
-    }
+  simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf8_with_errors(buf, len);
+  }
 
-    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_ascii(buf, len);
-    }
+  simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
+    return set_best()->validate_ascii(buf, len);
+  }
 
-    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_ascii_with_errors(buf, len);
-    }
+  simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
+    return set_best()->validate_ascii_with_errors(buf, len);
+  }
 
-    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf16le(buf, len);
-    }
+  simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16le(buf, len);
+  }
 
-    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf16be(buf, len);
-    }
+  simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16be(buf, len);
+  }
 
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf16le_with_errors(buf, len);
-    }
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16le_with_errors(buf, len);
+  }
 
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf16be_with_errors(buf, len);
-    }
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16be_with_errors(buf, len);
+  }
 
-    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf32(buf, len);
-    }
+  simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf32(buf, len);
+  }
 
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->validate_utf32_with_errors(buf, len);
-    }
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf32_with_errors(buf, len);
+  }
 
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf8(buf, len,utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
-    }
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf32(buf, len,latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
-    }
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf8_to_latin1(buf, len,latin1_output);
+  }
 
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
-    }
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept  final override {
+  return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_latin1(buf, len,latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
-    }
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
-    }
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
-    }
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
-    }
+   simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
-    }
+     simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output);
-    }
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output);
-    }
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
-    }
+   simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
-    }
+   simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
-    }
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
-    }
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
-    }
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf32_to_latin1(buf, len,latin1_output);
+  }
 
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
-    }
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_utf32_to_latin1_with_errors(buf, len,latin1_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * buf, size_t len, char* latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_latin1(buf, len,latin1_output); // forwards to the same-named method of the implementation returned by set_best()
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-    }
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
-    }
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
-    }
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
-    }
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
-    }
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
-    {
-        return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
+  }
 
-    void change_endianness_utf16(const char16_t* buf, size_t len, char16_t* output) const noexcept final override
-    {
-        set_best()->change_endianness_utf16(buf, len, output);
-    }
+  void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
+    set_best()->change_endianness_utf16(buf, len, output);
+  }
 
-    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->count_utf16le(buf, len);
-    }
+  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->count_utf16le(buf, len);
+  }
 
-    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t len) const noexcept final override
-    {
-        return set_best()->count_utf16be(buf, len);
-    }
+  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
+    return set_best()->count_utf16be(buf, len);
+  }
 
-    simdutf_warn_unused size_t count_utf8(const char* buf, size_t len) const noexcept final override
-    {
-        return set_best()->count_utf8(buf, len);
-    }
+  simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
+    return set_best()->count_utf8(buf, len);
+  }
 
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char* buf, size_t len) const noexcept override
-    {
-        return set_best()->latin1_length_from_utf8(buf, len);
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf8(buf, len);
+  }
 
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override
-    {
-        return set_best()->latin1_length_from_utf16(len);
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf16(len);
+  }
 
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override
-    {
-        return set_best()->latin1_length_from_utf32(len);
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf32(len);
+  }
 
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf8_length_from_latin1(buf, len);
-    }
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_latin1(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf8_length_from_utf16le(buf, len);
-    }
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf16le(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf8_length_from_utf16be(buf, len);
-    }
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf16be(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override
-    {
-        return set_best()->utf16_length_from_latin1(len);
-    }
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override {
+    return set_best()->utf16_length_from_latin1(len);
+  }
 
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override
-    {
-        return set_best()->utf32_length_from_latin1(len);
-    }
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override {
+    return set_best()->utf32_length_from_latin1(len);
+  }
 
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf32_length_from_utf16le(buf, len);
-    }
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf16le(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf32_length_from_utf16be(buf, len);
-    }
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf16be(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf16_length_from_utf8(buf, len);
-    }
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
+    return set_best()->utf16_length_from_utf8(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf8_length_from_utf32(buf, len);
-    }
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf32(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf16_length_from_utf32(buf, len);
-    }
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
+    return set_best()->utf16_length_from_utf32(buf, len);
+  }
 
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char* buf, size_t len) const noexcept override
-    {
-        return set_best()->utf32_length_from_utf8(buf, len);
-    }
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf8(buf, len);
+  }
 
-    simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept
-        : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0)
-    {
-    }
+  simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
 
 private:
-    const implementation* set_best() const noexcept;
+  const implementation *set_best() const noexcept;
 };
 
-static const std::initializer_list<const implementation*>& get_available_implementation_pointers()
-{
-    static const std::initializer_list<const implementation*> available_implementation_pointers
-    {
+static const std::initializer_list<const implementation *>& get_available_implementation_pointers() {
+  static const std::initializer_list<const implementation *> available_implementation_pointers {
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-        get_icelake_singleton(),
+    get_icelake_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-            get_haswell_singleton(),
+    get_haswell_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-            get_westmere_singleton(),
+    get_westmere_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-            get_arm64_singleton(),
+    get_arm64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-            get_ppc64_singleton(),
+    get_ppc64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-            get_fallback_singleton(),
+    get_fallback_singleton(),
 #endif
-    }; // available_implementation_pointers
-    return available_implementation_pointers;
+  }; // available_implementation_pointers
+  return available_implementation_pointers;
 }
 
 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
 class unsupported_implementation final : public implementation {
 public:
-    simdutf_warn_unused int detect_encodings(const char*, size_t) const noexcept override
-    {
-        return encoding_type::unspecified;
-    }
-
-    simdutf_warn_unused bool validate_utf8(const char*, size_t) const noexcept final override
-    {
-        return false; // Just refuse to validate. Given that we have a fallback implementation
-        // it seems unlikely that unsupported_implementation will ever be used. If it is used,
-        // then it will flag all strings as invalid. The alternative is to return an error_code
-        // from which the user has to figure out whether the string is valid UTF-8... which seems
-        // like a lot of work just to handle the very unlikely case that we have an unsupported
-        // implementation. And, when it does happen (that we have an unsupported implementation),
-        // what are the chances that the programmer has a fallback? Given that *we* provide the
-        // fallback, it implies that the programmer would need a fallback for our fallback.
-    }
-
-    simdutf_warn_unused result validate_utf8_with_errors(const char*, size_t) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused bool validate_ascii(const char*, size_t) const noexcept final override
-    {
-        return false;
-    }
-
-    simdutf_warn_unused result validate_ascii_with_errors(const char*, size_t) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override
-    {
-        return false;
-    }
-
-    simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override
-    {
-        return false;
-    }
-
-    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override
-    {
-        return false;
-    }
-
-    simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
+    return encoding_type::unspecified;
+  }
+
+  simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
+    return false; // Just refuse to validate. Given that we have a fallback implementation
+    // it seems unlikely that unsupported_implementation will ever be used. If it is used,
+    // then it will flag all strings as invalid. The alternative is to return an error_code
+    // from which the user has to figure out whether the string is valid UTF-8... which seems
+    // like a lot of work just to handle the very unlikely case that we have an unsupported
+    // implementation. And, when it does happen (that we have an unsupported implementation),
+    // what are the chances that the programmer has a fallback? Given that *we* provide the
+    // fallback, it implies that the programmer would need a fallback for our fallback.
+  }
+
+  simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *, size_t, char* ) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *, size_t, char* ) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {
+
+  }
 
-    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0;
+  }
 
-    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf16( size_t) const noexcept override {
+    return 0;
+  }
 
-    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return result(error_code::OTHER, 0);
-    }
-
-    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
-    {
-        return 0;
-    }
-
-    void change_endianness_utf16(const char16_t*, size_t, char16_t*) const noexcept final override
-    {
-    }
-
-    simdutf_warn_unused size_t count_utf16le(const char16_t*, size_t) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t count_utf16be(const char16_t*, size_t) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t count_utf8(const char*, size_t) const noexcept final override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t latin1_length_from_utf8(const char*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override
-    {
-        return 0;
-    }
-    simdutf_warn_unused size_t utf8_length_from_latin1(const char*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf16_length_from_utf8(const char*, size_t) const noexcept override
-    {
-        return 0;
-    }
-    simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override
-    {
-        return 0;
-    }
-    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    simdutf_warn_unused size_t utf32_length_from_utf8(const char*, size_t) const noexcept override
-    {
-        return 0;
-    }
-
-    unsupported_implementation()
-        : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0)
-    {
-    }
+  simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override {
+    return 0;
+  }
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0;
+  }
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override {
+    return 0;
+  }
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0;
+  }
+
+  unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
 };
 
-const unsupported_implementation unsupported_singleton {};
+const unsupported_implementation unsupported_singleton{};
 
-size_t available_implementation_list::size() const noexcept
-{
-    return internal::get_available_implementation_pointers().size();
+size_t available_implementation_list::size() const noexcept {
+  return internal::get_available_implementation_pointers().size();
 }
-const implementation* const* available_implementation_list::begin() const noexcept
-{
-    return internal::get_available_implementation_pointers().begin();
+const implementation * const *available_implementation_list::begin() const noexcept {
+  return internal::get_available_implementation_pointers().begin();
 }
-const implementation* const* available_implementation_list::end() const noexcept
-{
-    return internal::get_available_implementation_pointers().end();
+const implementation * const *available_implementation_list::end() const noexcept {
+  return internal::get_available_implementation_pointers().end();
 }
-const implementation* available_implementation_list::detect_best_supported() const noexcept
-{
-    // They are prelisted in priority order, so we just go down the list
-    uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-    for (const implementation* impl : internal::get_available_implementation_pointers()) {
-        uint32_t required_instruction_sets = impl->required_instruction_sets();
-        if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) {
-            return impl;
-        }
-    }
-    return &unsupported_singleton; // this should never happen?
+const implementation *available_implementation_list::detect_best_supported() const noexcept {
+  // They are prelisted in priority order, so we just go down the list
+  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
+  for (const implementation *impl : internal::get_available_implementation_pointers()) {
+    uint32_t required_instruction_sets = impl->required_instruction_sets();
+    if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
+  }
+  return &unsupported_singleton; // this should never happen?
 }
 
-const implementation* detect_best_supported_implementation_on_first_use::set_best() const noexcept
-{
-    SIMDUTF_PUSH_DISABLE_WARNINGS
-    SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
-        char* force_implementation_name
-        = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
-    SIMDUTF_POP_DISABLE_WARNINGS
+const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
+  SIMDUTF_PUSH_DISABLE_WARNINGS
+  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
+  char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
+  SIMDUTF_POP_DISABLE_WARNINGS
 
-    if (force_implementation_name) {
-        auto force_implementation = get_available_implementations()[force_implementation_name];
-        if (force_implementation) {
-            return get_active_implementation() = force_implementation;
-        } else {
-            // Note: abort() and stderr usage within the library is forbidden.
-            return get_active_implementation() = &unsupported_singleton;
-        }
+  if (force_implementation_name) {
+    auto force_implementation = get_available_implementations()[force_implementation_name];
+    if (force_implementation) {
+      return get_active_implementation() = force_implementation;
+    } else {
+      // Note: abort() and stderr usage within the library is forbidden.
+      return get_active_implementation() = &unsupported_singleton;
     }
-    return get_active_implementation() = get_available_implementations().detect_best_supported();
+  }
+  return get_active_implementation() = get_available_implementations().detect_best_supported();
 }
 
 } // namespace internal
 
+
+
 /**
  * The list of available implementations compiled into simdutf.
  */
-SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations()
-{
-    static const internal::available_implementation_list available_implementations {};
-    return available_implementations;
+SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
+  static const internal::available_implementation_list available_implementations{};
+  return available_implementations;
 }
 
 /**
- * The active implementation.
- */
-SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation()
-{
+  * The active implementation.
+  */
+SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
-    static internal::atomic_ptr<const implementation> active_implementation { &detect_best_supported_implementation_on_first_use_singleton };
+    static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
     return active_implementation;
 }
 
-simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf8(buf, len);
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf8(buf, len);
 }
-simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf8_with_errors(buf, len);
 }
-simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_ascii(buf, len);
+simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
+  return get_active_implementation()->validate_ascii(buf, len);
 }
-simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
+  return get_active_implementation()->validate_ascii_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf8_to_utf16be(input, length, utf16_output);
-#else
-    return convert_utf8_to_utf16le(input, length, utf16_output);
-#endif
+simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf8_to_utf16be(input, length, utf16_output);
+  #else
+  return convert_utf8_to_utf16le(input, length, utf16_output);
+  #endif
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
-#else
-    return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
-#endif
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+  #else
+  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+  #endif
 }
-simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_latin1_to_utf16le(input, length, utf16_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_latin1_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept
-{
-    return get_active_implementation()->convert_latin1_to_utf16be(input, length, utf16_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
+  return get_active_implementation()->convert_latin1_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
+simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* input, size_t length, char32_t* utf32_output) noexcept
-{
-    return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
+  return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
 }
-simdutf_warn_unused bool validate_utf16(const char16_t* buf, size_t len) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return validate_utf16be(buf, len);
-#else
-    return validate_utf16le(buf, len);
-#endif
+simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return validate_utf16be(buf, len);
+  #else
+  return validate_utf16le(buf, len);
+  #endif
 }
-simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf16le(buf, len);
+simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf16le(buf, len);
 }
-simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf16be(buf, len);
+simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf16be(buf, len);
 }
-simdutf_warn_unused result validate_utf16_with_errors(const char16_t* buf, size_t len) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return validate_utf16be_with_errors(buf, len);
-#else
-    return validate_utf16le_with_errors(buf, len);
-#endif
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return validate_utf16be_with_errors(buf, len);
+  #else
+  return validate_utf16le_with_errors(buf, len);
+  #endif
 }
-simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf16le_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf16le_with_errors(buf, len);
 }
-simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf16be_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf16be_with_errors(buf, len);
 }
-simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf32(buf, len);
+simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf32(buf, len);
 }
-simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) noexcept
-{
-    return get_active_implementation()->validate_utf32_with_errors(buf, len);
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
+  return get_active_implementation()->validate_utf32_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
-#else
-    return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
-#endif
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+  #else
+  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf16be_to_utf8(buf, len, utf8_buffer);
-#else
-    return convert_utf16le_to_utf8(buf, len, utf8_buffer);
-#endif
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
+  #else
+  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
-#else
-    return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
-#endif
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+  #else
+  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+  #endif
 }
-simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
-#else
-    return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
-#endif
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+  #else
+  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf32_to_utf16be(buf, len, utf16_buffer);
-#else
-    return convert_utf32_to_utf16le(buf, len, utf16_buffer);
-#endif
+simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
+  #else
+  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_output) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
+simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept {
+  return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
-#else
-    return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
-#endif
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+  #else
+  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+  #endif
 }
-simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
-#else
-    return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
-#endif
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+  #else
+  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf16be_to_utf32(buf, len, utf32_buffer);
-#else
-    return convert_utf16le_to_utf32(buf, len, utf32_buffer);
-#endif
+simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
+  #else
+  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
-#else
-    return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
-#endif
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+  #else
+  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+  #endif
 }
-simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
-#else
-    return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
-#endif
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+  #else
+  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+  #endif
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
-{
-    return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
+  return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
 }
-void change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) noexcept
-{
-    get_active_implementation()->change_endianness_utf16(input, length, output);
+void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
+  get_active_implementation()->change_endianness_utf16(input, length, output);
 }
-simdutf_warn_unused size_t count_utf16(const char16_t* input, size_t length) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return count_utf16be(input, length);
-#else
-    return count_utf16le(input, length);
-#endif
+simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return count_utf16be(input, length);
+  #else
+  return count_utf16le(input, length);
+  #endif
 }
-simdutf_warn_unused size_t count_utf16le(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->count_utf16le(input, length);
+simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->count_utf16le(input, length);
 }
-simdutf_warn_unused size_t count_utf16be(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->count_utf16be(input, length);
+simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->count_utf16be(input, length);
 }
-simdutf_warn_unused size_t count_utf8(const char* input, size_t length) noexcept
-{
-    return get_active_implementation()->count_utf8(input, length);
+simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
+  return get_active_implementation()->count_utf8(input, length);
 }
-simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t* input, size_t length) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return utf8_length_from_utf16be(input, length);
-#else
-    return utf8_length_from_utf16le(input, length);
-#endif
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return utf8_length_from_utf16be(input, length);
+  #else
+  return utf8_length_from_utf16le(input, length);
+  #endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf8_length_from_utf16le(input, length);
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf8_length_from_utf16le(input, length);
 }
-simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf8_length_from_utf16be(input, length);
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf8_length_from_utf16be(input, length);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t* input, size_t length) noexcept
-{
-#if SIMDUTF_IS_BIG_ENDIAN
-    return utf32_length_from_utf16be(input, length);
-#else
-    return utf32_length_from_utf16le(input, length);
-#endif
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
+  #if SIMDUTF_IS_BIG_ENDIAN
+  return utf32_length_from_utf16be(input, length);
+  #else
+  return utf32_length_from_utf16le(input, length);
+  #endif
 }
-simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf32_length_from_utf16le(input, length);
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf32_length_from_utf16le(input, length);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf32_length_from_utf16be(input, length);
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf32_length_from_utf16be(input, length);
 }
-simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
+  return get_active_implementation()->utf16_length_from_utf8(input, length);
 }
-simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept
-{
-    return get_active_implementation()->utf16_length_from_latin1(length);
+simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
+  return get_active_implementation()->utf16_length_from_latin1(length);
 }
-simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf8_length_from_utf32(input, length);
 }
-simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
+  return get_active_implementation()->utf16_length_from_utf32(input, length);
 }
-simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) noexcept
-{
-    return get_active_implementation()->utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
+  return get_active_implementation()->utf32_length_from_utf8(input, length);
 }
-simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char* buf, size_t length) noexcept
-{
-    return get_active_implementation()->autodetect_encoding(buf, length);
+simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
+  return get_active_implementation()->autodetect_encoding(buf, length);
 }
-simdutf_warn_unused int detect_encodings(const char* buf, size_t length) noexcept
-{
-    return get_active_implementation()->detect_encodings(buf, length);
+simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
+  return get_active_implementation()->detect_encodings(buf, length);
 }
 
-const implementation* builtin_implementation()
-{
-    static const implementation* builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
-    return builtin_impl;
+const implementation * builtin_implementation() {
+  static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
+  return builtin_impl;
 }
 
+
 } // namespace simdutf
 
 /* end file src/implementation.cpp */
@@ -6259,8 +5347,7 @@ const implementation* builtin_implementation()
 /* begin file src/encoding_types.cpp */
 
 namespace simdutf {
-bool match_system(endianness e)
-{
+bool match_system(endianness e) {
 #if SIMDUTF_IS_BIG_ENDIAN
     return e == endianness::BIG;
 #else
@@ -6268,69 +5355,51 @@ bool match_system(endianness e)
 #endif
 }
 
-std::string to_string(encoding_type bom)
-{
-    switch (bom) {
-    case UTF16_LE:
-        return "UTF16 little-endian";
-    case UTF16_BE:
-        return "UTF16 big-endian";
-    case UTF32_LE:
-        return "UTF32 little-endian";
-    case UTF32_BE:
-        return "UTF32 big-endian";
-    case UTF8:
-        return "UTF8";
-    case unspecified:
-        return "unknown";
-    default:
-        return "error";
-    }
+std::string to_string(encoding_type bom) {
+  switch (bom) {
+      case UTF16_LE:     return "UTF16 little-endian";
+      case UTF16_BE:     return "UTF16 big-endian";
+      case UTF32_LE:     return "UTF32 little-endian";
+      case UTF32_BE:     return "UTF32 big-endian";
+      case UTF8:         return "UTF8";
+      case unspecified:  return "unknown";
+      default:           return "error";
+  }
 }
 
 namespace BOM {
 // Note that BOM for UTF8 is discouraged.
-encoding_type check_bom(const uint8_t* byte, size_t length)
-{
-    if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
-        if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
-            return encoding_type::UTF32_LE;
-        } else {
-            return encoding_type::UTF16_LE;
+encoding_type check_bom(const uint8_t* byte, size_t length) {
+        if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
+            if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
+                return encoding_type::UTF32_LE;
+            } else {
+                return encoding_type::UTF16_LE;
+            }
+        } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
+            return encoding_type::UTF16_BE;
+        } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
+            return encoding_type::UTF32_BE;
+        } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
+            return encoding_type::UTF8;
         }
-    } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
-        return encoding_type::UTF16_BE;
-    } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
-        return encoding_type::UTF32_BE;
-    } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
-        return encoding_type::UTF8;
+        return encoding_type::unspecified;
     }
-    return encoding_type::unspecified;
-}
 
-encoding_type check_bom(const char* byte, size_t length)
-{
-    return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
-}
-
-size_t bom_byte_size(encoding_type bom)
-{
-    switch (bom) {
-    case UTF16_LE:
-        return 2;
-    case UTF16_BE:
-        return 2;
-    case UTF32_LE:
-        return 4;
-    case UTF32_BE:
-        return 4;
-    case UTF8:
-        return 3;
-    case unspecified:
-        return 0;
-    default:
-        return 0;
-    }
+encoding_type check_bom(const char* byte, size_t length) {
+      return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
+ }
+
+ size_t bom_byte_size(encoding_type bom) {
+        switch (bom) {
+            case UTF16_LE:     return 2;
+            case UTF16_BE:     return 2;
+            case UTF32_LE:     return 4;
+            case UTF32_BE:     return 4;
+            case UTF8:         return 3;
+            case unspecified:  return 0;
+            default:           return 0;
+        }
 }
 
 }
@@ -6340,13 +5409,9 @@ size_t bom_byte_size(encoding_type bom)
 /* begin file src/error.cpp */
 namespace simdutf {
 
-simdutf_really_inline result::result()
-    : error { error_code::SUCCESS }
-    , count { 0 } {};
+  simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {};
 
-simdutf_really_inline result::result(error_code _err, size_t _pos)
-    : error { _err }
-    , count { _pos } {};
+  simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {};
 
 }
 /* end file src/error.cpp */
@@ -6374,4314 +5439,4316 @@ namespace utf8_to_utf16 {
  * performance penalty.
  */
 
-const uint8_t shufutf8[209][16] = { { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
-    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
-    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0 },
-    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0 },
-    { 0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0 },
-    { 1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0 },
-    { 2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0 },
-    { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0 } };
+const uint8_t shufutf8[209][16] =
+{	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
+ 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
+ 	{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
+ 	{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
+ 	{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
+ 	{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
 /* number of two bytes : 64 */
 /* number of two + three bytes : 145 */
 /* number of two + three + four bytes : 209 */
-const uint8_t utf8bigindex[4096][2] = { { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 148, 6 },
-    { 209, 12 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 209, 12 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 209, 12 },
-    { 155, 7 },
-    { 167, 7 },
-    { 69, 7 },
-    { 179, 7 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 170, 7 },
-    { 71, 7 },
-    { 182, 7 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 185, 7 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 171, 8 },
-    { 72, 8 },
-    { 183, 8 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 186, 8 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 68, 6 },
-    { 122, 8 },
-    { 74, 6 },
-    { 92, 6 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 76, 6 },
-    { 94, 6 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 77, 7 },
-    { 95, 7 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 187, 9 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 77, 7 },
-    { 95, 7 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 176, 10 },
-    { 148, 6 },
-    { 188, 10 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 200, 10 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 191, 10 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 203, 10 },
-    { 90, 10 },
-    { 108, 10 },
-    { 69, 7 },
-    { 126, 10 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 114, 10 },
-    { 71, 7 },
-    { 132, 10 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 138, 10 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 206, 10 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 116, 10 },
-    { 72, 8 },
-    { 134, 10 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 140, 10 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 15, 10 },
-    { 122, 8 },
-    { 23, 10 },
-    { 39, 10 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 27, 10 },
-    { 43, 10 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 51, 10 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 29, 10 },
-    { 45, 10 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 53, 10 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 57, 10 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 142, 10 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 30, 10 },
-    { 46, 10 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 54, 10 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 58, 10 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 60, 10 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 148, 6 },
-    { 209, 12 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 209, 12 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 192, 11 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 204, 11 },
-    { 155, 7 },
-    { 167, 7 },
-    { 69, 7 },
-    { 179, 7 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 170, 7 },
-    { 71, 7 },
-    { 182, 7 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 185, 7 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 207, 11 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 117, 11 },
-    { 72, 8 },
-    { 135, 11 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 141, 11 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 68, 6 },
-    { 122, 8 },
-    { 74, 6 },
-    { 92, 6 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 76, 6 },
-    { 94, 6 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 77, 7 },
-    { 95, 7 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 143, 11 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 31, 11 },
-    { 47, 11 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 55, 11 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 59, 11 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 61, 11 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 176, 10 },
-    { 148, 6 },
-    { 188, 10 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 200, 10 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 191, 10 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 203, 10 },
-    { 90, 10 },
-    { 108, 10 },
-    { 69, 7 },
-    { 126, 10 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 114, 10 },
-    { 71, 7 },
-    { 132, 10 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 138, 10 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 206, 10 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 116, 10 },
-    { 72, 8 },
-    { 134, 10 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 140, 10 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 62, 11 },
-    { 15, 10 },
-    { 122, 8 },
-    { 23, 10 },
-    { 39, 10 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 27, 10 },
-    { 43, 10 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 51, 10 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 29, 10 },
-    { 45, 10 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 53, 10 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 57, 10 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 142, 10 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 30, 10 },
-    { 46, 10 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 54, 10 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 58, 10 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 60, 10 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 148, 6 },
-    { 209, 12 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 209, 12 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 209, 12 },
-    { 155, 7 },
-    { 167, 7 },
-    { 69, 7 },
-    { 179, 7 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 170, 7 },
-    { 71, 7 },
-    { 182, 7 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 185, 7 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 208, 12 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 171, 8 },
-    { 72, 8 },
-    { 183, 8 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 186, 8 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 68, 6 },
-    { 122, 8 },
-    { 74, 6 },
-    { 92, 6 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 76, 6 },
-    { 94, 6 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 77, 7 },
-    { 95, 7 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 144, 12 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 77, 7 },
-    { 95, 7 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 176, 10 },
-    { 148, 6 },
-    { 188, 10 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 200, 10 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 191, 10 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 203, 10 },
-    { 90, 10 },
-    { 108, 10 },
-    { 69, 7 },
-    { 126, 10 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 114, 10 },
-    { 71, 7 },
-    { 132, 10 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 138, 10 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 206, 10 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 116, 10 },
-    { 72, 8 },
-    { 134, 10 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 140, 10 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 63, 12 },
-    { 15, 10 },
-    { 122, 8 },
-    { 23, 10 },
-    { 39, 10 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 27, 10 },
-    { 43, 10 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 51, 10 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 29, 10 },
-    { 45, 10 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 53, 10 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 57, 10 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 142, 10 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 30, 10 },
-    { 46, 10 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 54, 10 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 58, 10 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 60, 10 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 148, 6 },
-    { 209, 12 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 209, 12 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 192, 11 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 204, 11 },
-    { 155, 7 },
-    { 167, 7 },
-    { 69, 7 },
-    { 179, 7 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 170, 7 },
-    { 71, 7 },
-    { 182, 7 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 185, 7 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 207, 11 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 117, 11 },
-    { 72, 8 },
-    { 135, 11 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 141, 11 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 104, 8 },
-    { 68, 6 },
-    { 122, 8 },
-    { 74, 6 },
-    { 92, 6 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 76, 6 },
-    { 94, 6 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 77, 7 },
-    { 95, 7 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 143, 11 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 31, 11 },
-    { 47, 11 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 55, 11 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 59, 11 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 61, 11 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 147, 5 },
-    { 209, 12 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 209, 12 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 176, 10 },
-    { 148, 6 },
-    { 188, 10 },
-    { 151, 6 },
-    { 163, 6 },
-    { 66, 6 },
-    { 200, 10 },
-    { 154, 6 },
-    { 166, 6 },
-    { 68, 6 },
-    { 178, 6 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 169, 6 },
-    { 70, 6 },
-    { 181, 6 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 191, 10 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 203, 10 },
-    { 90, 10 },
-    { 108, 10 },
-    { 69, 7 },
-    { 126, 10 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 114, 10 },
-    { 71, 7 },
-    { 132, 10 },
-    { 77, 7 },
-    { 95, 7 },
-    { 65, 5 },
-    { 194, 7 },
-    { 83, 7 },
-    { 101, 7 },
-    { 67, 5 },
-    { 119, 7 },
-    { 73, 5 },
-    { 91, 5 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 138, 10 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 103, 7 },
-    { 68, 6 },
-    { 121, 7 },
-    { 74, 6 },
-    { 92, 6 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 76, 6 },
-    { 94, 6 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 206, 10 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 116, 10 },
-    { 72, 8 },
-    { 134, 10 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 140, 10 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 62, 11 },
-    { 15, 10 },
-    { 122, 8 },
-    { 23, 10 },
-    { 39, 10 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 27, 10 },
-    { 43, 10 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 51, 10 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 29, 10 },
-    { 45, 10 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 53, 10 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 57, 10 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 146, 4 },
-    { 209, 12 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 160, 9 },
-    { 172, 9 },
-    { 147, 5 },
-    { 184, 9 },
-    { 150, 5 },
-    { 162, 5 },
-    { 65, 5 },
-    { 196, 9 },
-    { 153, 5 },
-    { 165, 5 },
-    { 67, 5 },
-    { 177, 5 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 175, 9 },
-    { 148, 6 },
-    { 142, 10 },
-    { 81, 9 },
-    { 99, 9 },
-    { 66, 6 },
-    { 199, 9 },
-    { 87, 9 },
-    { 105, 9 },
-    { 68, 6 },
-    { 123, 9 },
-    { 74, 6 },
-    { 92, 6 },
-    { 64, 4 },
-    { 209, 12 },
-    { 157, 6 },
-    { 111, 9 },
-    { 70, 6 },
-    { 129, 9 },
-    { 76, 6 },
-    { 94, 6 },
-    { 65, 5 },
-    { 193, 6 },
-    { 82, 6 },
-    { 100, 6 },
-    { 67, 5 },
-    { 118, 6 },
-    { 73, 5 },
-    { 91, 5 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 190, 9 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 202, 9 },
-    { 89, 9 },
-    { 107, 9 },
-    { 69, 7 },
-    { 125, 9 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 113, 9 },
-    { 71, 7 },
-    { 131, 9 },
-    { 30, 10 },
-    { 46, 10 },
-    { 7, 9 },
-    { 194, 7 },
-    { 83, 7 },
-    { 54, 10 },
-    { 11, 9 },
-    { 119, 7 },
-    { 19, 9 },
-    { 35, 9 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 137, 9 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 58, 10 },
-    { 13, 9 },
-    { 121, 7 },
-    { 21, 9 },
-    { 37, 9 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 25, 9 },
-    { 41, 9 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 49, 9 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 145, 3 },
-    { 205, 9 },
-    { 156, 8 },
-    { 168, 8 },
-    { 146, 4 },
-    { 180, 8 },
-    { 149, 4 },
-    { 161, 4 },
-    { 64, 4 },
-    { 209, 12 },
-    { 159, 8 },
-    { 115, 9 },
-    { 72, 8 },
-    { 133, 9 },
-    { 78, 8 },
-    { 96, 8 },
-    { 65, 5 },
-    { 195, 8 },
-    { 84, 8 },
-    { 102, 8 },
-    { 67, 5 },
-    { 120, 8 },
-    { 73, 5 },
-    { 91, 5 },
-    { 64, 4 },
-    { 209, 12 },
-    { 209, 12 },
-    { 174, 8 },
-    { 148, 6 },
-    { 139, 9 },
-    { 80, 8 },
-    { 98, 8 },
-    { 66, 6 },
-    { 198, 8 },
-    { 86, 8 },
-    { 60, 10 },
-    { 14, 9 },
-    { 122, 8 },
-    { 22, 9 },
-    { 38, 9 },
-    { 3, 8 },
-    { 209, 12 },
-    { 157, 6 },
-    { 110, 8 },
-    { 70, 6 },
-    { 128, 8 },
-    { 26, 9 },
-    { 42, 9 },
-    { 5, 8 },
-    { 193, 6 },
-    { 82, 6 },
-    { 50, 9 },
-    { 9, 8 },
-    { 118, 6 },
-    { 17, 8 },
-    { 33, 8 },
-    { 0, 6 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 209, 12 },
-    { 189, 8 },
-    { 152, 7 },
-    { 164, 7 },
-    { 145, 3 },
-    { 201, 8 },
-    { 88, 8 },
-    { 106, 8 },
-    { 69, 7 },
-    { 124, 8 },
-    { 75, 7 },
-    { 93, 7 },
-    { 64, 4 },
-    { 209, 12 },
-    { 158, 7 },
-    { 112, 8 },
-    { 71, 7 },
-    { 130, 8 },
-    { 28, 9 },
-    { 44, 9 },
-    { 6, 8 },
-    { 194, 7 },
-    { 83, 7 },
-    { 52, 9 },
-    { 10, 8 },
-    { 119, 7 },
-    { 18, 8 },
-    { 34, 8 },
-    { 1, 7 },
-    { 209, 12 },
-    { 209, 12 },
-    { 173, 7 },
-    { 148, 6 },
-    { 136, 8 },
-    { 79, 7 },
-    { 97, 7 },
-    { 66, 6 },
-    { 197, 7 },
-    { 85, 7 },
-    { 56, 9 },
-    { 12, 8 },
-    { 121, 7 },
-    { 20, 8 },
-    { 36, 8 },
-    { 2, 7 },
-    { 209, 12 },
-    { 157, 6 },
-    { 109, 7 },
-    { 70, 6 },
-    { 127, 7 },
-    { 24, 8 },
-    { 40, 8 },
-    { 4, 7 },
-    { 193, 6 },
-    { 82, 6 },
-    { 48, 8 },
-    { 8, 7 },
-    { 118, 6 },
-    { 16, 7 },
-    { 32, 7 },
-    { 0, 6 } };
+const uint8_t utf8bigindex[4096][2] =
+{	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{148, 6},
+ 	{209, 12},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{209, 12},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{209, 12},
+ 	{155, 7},
+ 	{167, 7},
+ 	{69, 7},
+ 	{179, 7},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{170, 7},
+ 	{71, 7},
+ 	{182, 7},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{185, 7},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{171, 8},
+ 	{72, 8},
+ 	{183, 8},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{186, 8},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{68, 6},
+ 	{122, 8},
+ 	{74, 6},
+ 	{92, 6},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{76, 6},
+ 	{94, 6},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{77, 7},
+ 	{95, 7},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{187, 9},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{77, 7},
+ 	{95, 7},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{176, 10},
+ 	{148, 6},
+ 	{188, 10},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{200, 10},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{191, 10},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{203, 10},
+ 	{90, 10},
+ 	{108, 10},
+ 	{69, 7},
+ 	{126, 10},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{114, 10},
+ 	{71, 7},
+ 	{132, 10},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{138, 10},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{206, 10},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{116, 10},
+ 	{72, 8},
+ 	{134, 10},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{140, 10},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{15, 10},
+ 	{122, 8},
+ 	{23, 10},
+ 	{39, 10},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{27, 10},
+ 	{43, 10},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{51, 10},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{29, 10},
+ 	{45, 10},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{53, 10},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{57, 10},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{142, 10},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{30, 10},
+ 	{46, 10},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{54, 10},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{58, 10},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{60, 10},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{148, 6},
+ 	{209, 12},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{209, 12},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{192, 11},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{204, 11},
+ 	{155, 7},
+ 	{167, 7},
+ 	{69, 7},
+ 	{179, 7},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{170, 7},
+ 	{71, 7},
+ 	{182, 7},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{185, 7},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{207, 11},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{117, 11},
+ 	{72, 8},
+ 	{135, 11},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{141, 11},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{68, 6},
+ 	{122, 8},
+ 	{74, 6},
+ 	{92, 6},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{76, 6},
+ 	{94, 6},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{77, 7},
+ 	{95, 7},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{143, 11},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{31, 11},
+ 	{47, 11},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{55, 11},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{59, 11},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{61, 11},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{176, 10},
+ 	{148, 6},
+ 	{188, 10},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{200, 10},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{191, 10},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{203, 10},
+ 	{90, 10},
+ 	{108, 10},
+ 	{69, 7},
+ 	{126, 10},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{114, 10},
+ 	{71, 7},
+ 	{132, 10},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{138, 10},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{206, 10},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{116, 10},
+ 	{72, 8},
+ 	{134, 10},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{140, 10},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{62, 11},
+ 	{15, 10},
+ 	{122, 8},
+ 	{23, 10},
+ 	{39, 10},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{27, 10},
+ 	{43, 10},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{51, 10},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{29, 10},
+ 	{45, 10},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{53, 10},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{57, 10},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{142, 10},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{30, 10},
+ 	{46, 10},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{54, 10},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{58, 10},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{60, 10},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{148, 6},
+ 	{209, 12},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{209, 12},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{209, 12},
+ 	{155, 7},
+ 	{167, 7},
+ 	{69, 7},
+ 	{179, 7},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{170, 7},
+ 	{71, 7},
+ 	{182, 7},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{185, 7},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{208, 12},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{171, 8},
+ 	{72, 8},
+ 	{183, 8},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{186, 8},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{68, 6},
+ 	{122, 8},
+ 	{74, 6},
+ 	{92, 6},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{76, 6},
+ 	{94, 6},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{77, 7},
+ 	{95, 7},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{144, 12},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{77, 7},
+ 	{95, 7},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{176, 10},
+ 	{148, 6},
+ 	{188, 10},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{200, 10},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{191, 10},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{203, 10},
+ 	{90, 10},
+ 	{108, 10},
+ 	{69, 7},
+ 	{126, 10},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{114, 10},
+ 	{71, 7},
+ 	{132, 10},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{138, 10},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{206, 10},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{116, 10},
+ 	{72, 8},
+ 	{134, 10},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{140, 10},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{63, 12},
+ 	{15, 10},
+ 	{122, 8},
+ 	{23, 10},
+ 	{39, 10},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{27, 10},
+ 	{43, 10},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{51, 10},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{29, 10},
+ 	{45, 10},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{53, 10},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{57, 10},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{142, 10},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{30, 10},
+ 	{46, 10},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{54, 10},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{58, 10},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{60, 10},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{148, 6},
+ 	{209, 12},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{209, 12},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{192, 11},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{204, 11},
+ 	{155, 7},
+ 	{167, 7},
+ 	{69, 7},
+ 	{179, 7},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{170, 7},
+ 	{71, 7},
+ 	{182, 7},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{185, 7},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{207, 11},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{117, 11},
+ 	{72, 8},
+ 	{135, 11},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{141, 11},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{104, 8},
+ 	{68, 6},
+ 	{122, 8},
+ 	{74, 6},
+ 	{92, 6},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{76, 6},
+ 	{94, 6},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{77, 7},
+ 	{95, 7},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{143, 11},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{31, 11},
+ 	{47, 11},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{55, 11},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{59, 11},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{61, 11},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{147, 5},
+ 	{209, 12},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{209, 12},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{176, 10},
+ 	{148, 6},
+ 	{188, 10},
+ 	{151, 6},
+ 	{163, 6},
+ 	{66, 6},
+ 	{200, 10},
+ 	{154, 6},
+ 	{166, 6},
+ 	{68, 6},
+ 	{178, 6},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{169, 6},
+ 	{70, 6},
+ 	{181, 6},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{191, 10},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{203, 10},
+ 	{90, 10},
+ 	{108, 10},
+ 	{69, 7},
+ 	{126, 10},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{114, 10},
+ 	{71, 7},
+ 	{132, 10},
+ 	{77, 7},
+ 	{95, 7},
+ 	{65, 5},
+ 	{194, 7},
+ 	{83, 7},
+ 	{101, 7},
+ 	{67, 5},
+ 	{119, 7},
+ 	{73, 5},
+ 	{91, 5},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{138, 10},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{103, 7},
+ 	{68, 6},
+ 	{121, 7},
+ 	{74, 6},
+ 	{92, 6},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{76, 6},
+ 	{94, 6},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{206, 10},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{116, 10},
+ 	{72, 8},
+ 	{134, 10},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{140, 10},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{62, 11},
+ 	{15, 10},
+ 	{122, 8},
+ 	{23, 10},
+ 	{39, 10},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{27, 10},
+ 	{43, 10},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{51, 10},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{29, 10},
+ 	{45, 10},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{53, 10},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{57, 10},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{146, 4},
+ 	{209, 12},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{160, 9},
+ 	{172, 9},
+ 	{147, 5},
+ 	{184, 9},
+ 	{150, 5},
+ 	{162, 5},
+ 	{65, 5},
+ 	{196, 9},
+ 	{153, 5},
+ 	{165, 5},
+ 	{67, 5},
+ 	{177, 5},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{175, 9},
+ 	{148, 6},
+ 	{142, 10},
+ 	{81, 9},
+ 	{99, 9},
+ 	{66, 6},
+ 	{199, 9},
+ 	{87, 9},
+ 	{105, 9},
+ 	{68, 6},
+ 	{123, 9},
+ 	{74, 6},
+ 	{92, 6},
+ 	{64, 4},
+ 	{209, 12},
+ 	{157, 6},
+ 	{111, 9},
+ 	{70, 6},
+ 	{129, 9},
+ 	{76, 6},
+ 	{94, 6},
+ 	{65, 5},
+ 	{193, 6},
+ 	{82, 6},
+ 	{100, 6},
+ 	{67, 5},
+ 	{118, 6},
+ 	{73, 5},
+ 	{91, 5},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{190, 9},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{202, 9},
+ 	{89, 9},
+ 	{107, 9},
+ 	{69, 7},
+ 	{125, 9},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{113, 9},
+ 	{71, 7},
+ 	{131, 9},
+ 	{30, 10},
+ 	{46, 10},
+ 	{7, 9},
+ 	{194, 7},
+ 	{83, 7},
+ 	{54, 10},
+ 	{11, 9},
+ 	{119, 7},
+ 	{19, 9},
+ 	{35, 9},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{137, 9},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{58, 10},
+ 	{13, 9},
+ 	{121, 7},
+ 	{21, 9},
+ 	{37, 9},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{25, 9},
+ 	{41, 9},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{49, 9},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{145, 3},
+ 	{205, 9},
+ 	{156, 8},
+ 	{168, 8},
+ 	{146, 4},
+ 	{180, 8},
+ 	{149, 4},
+ 	{161, 4},
+ 	{64, 4},
+ 	{209, 12},
+ 	{159, 8},
+ 	{115, 9},
+ 	{72, 8},
+ 	{133, 9},
+ 	{78, 8},
+ 	{96, 8},
+ 	{65, 5},
+ 	{195, 8},
+ 	{84, 8},
+ 	{102, 8},
+ 	{67, 5},
+ 	{120, 8},
+ 	{73, 5},
+ 	{91, 5},
+ 	{64, 4},
+ 	{209, 12},
+ 	{209, 12},
+ 	{174, 8},
+ 	{148, 6},
+ 	{139, 9},
+ 	{80, 8},
+ 	{98, 8},
+ 	{66, 6},
+ 	{198, 8},
+ 	{86, 8},
+ 	{60, 10},
+ 	{14, 9},
+ 	{122, 8},
+ 	{22, 9},
+ 	{38, 9},
+ 	{3, 8},
+ 	{209, 12},
+ 	{157, 6},
+ 	{110, 8},
+ 	{70, 6},
+ 	{128, 8},
+ 	{26, 9},
+ 	{42, 9},
+ 	{5, 8},
+ 	{193, 6},
+ 	{82, 6},
+ 	{50, 9},
+ 	{9, 8},
+ 	{118, 6},
+ 	{17, 8},
+ 	{33, 8},
+ 	{0, 6},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{209, 12},
+ 	{189, 8},
+ 	{152, 7},
+ 	{164, 7},
+ 	{145, 3},
+ 	{201, 8},
+ 	{88, 8},
+ 	{106, 8},
+ 	{69, 7},
+ 	{124, 8},
+ 	{75, 7},
+ 	{93, 7},
+ 	{64, 4},
+ 	{209, 12},
+ 	{158, 7},
+ 	{112, 8},
+ 	{71, 7},
+ 	{130, 8},
+ 	{28, 9},
+ 	{44, 9},
+ 	{6, 8},
+ 	{194, 7},
+ 	{83, 7},
+ 	{52, 9},
+ 	{10, 8},
+ 	{119, 7},
+ 	{18, 8},
+ 	{34, 8},
+ 	{1, 7},
+ 	{209, 12},
+ 	{209, 12},
+ 	{173, 7},
+ 	{148, 6},
+ 	{136, 8},
+ 	{79, 7},
+ 	{97, 7},
+ 	{66, 6},
+ 	{197, 7},
+ 	{85, 7},
+ 	{56, 9},
+ 	{12, 8},
+ 	{121, 7},
+ 	{20, 8},
+ 	{36, 8},
+ 	{2, 7},
+ 	{209, 12},
+ 	{157, 6},
+ 	{109, 7},
+ 	{70, 6},
+ 	{127, 7},
+ 	{24, 8},
+ 	{40, 8},
+ 	{4, 7},
+ 	{193, 6},
+ 	{82, 6},
+ 	{48, 8},
+ 	{8, 7},
+ 	{118, 6},
+ 	{16, 7},
+ 	{32, 7},
+ 	{0, 6}};
 } // utf8_to_utf16 namespace
 } // tables namespace
 } // unnamed namespace
@@ -10700,525 +9767,525 @@ namespace {
 namespace tables {
 namespace utf16_to_utf8 {
 
-// 1 byte for length, 16 bytes for mask
-const uint8_t pack_1_2_utf8_bytes[256][17] = {
-    { 16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
-    { 15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
-    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80 },
-    { 14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
-    { 14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80 },
-    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
-    { 14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80 },
-    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
-    { 14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80 },
-    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
-    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
-    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
-    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
-};
-
-// 1 byte for length, 16 bytes for mask
-const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
-    { 12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
-    { 4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
-};
+  // 1 byte for length, 16 bytes for mask
+  const uint8_t pack_1_2_utf8_bytes[256][17] = {
+    {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
+    {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
+    {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
+    {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
+    {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
+    {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
+    {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
+    {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
+    {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
+    {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
+    {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
+    {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
+    {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
+    {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
+    {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
+    {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
+    {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
+    {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
+    {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
+    {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
+    {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
+    {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
+    {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
+    {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
+    {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
+    {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
+    {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
+    {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
+    {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
+    {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
+  };
+
+  // 1 byte for length, 16 bytes for mask
+  const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
+    {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
+    {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
+    {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
+    {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
+    {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
+    {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
+  };
 
 } // utf16_to_utf8 namespace
 } // tables namespace
@@ -11241,55 +10308,45 @@ namespace {
 namespace ascii {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // Only used by the fallback kernel.
-inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
     uint64_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (; pos + 16 < len; pos += 16) {
+    for (;pos + 16 <= len; pos += 16) {
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v { v1 | v2 };
-        if ((v & 0x8080808080808080) != 0) {
-            return false;
-        }
+        uint64_t v{v1 | v2};
+        if ((v & 0x8080808080808080) != 0) { return false; }
     }
     // process the tail byte-by-byte
-    for (; pos < len; pos++) {
-        if (data[pos] >= 0b10000000) {
-            return false;
-        }
+    for (;pos < len; pos ++) {
+        if (data[pos] >= 0b10000000) { return false; }
     }
     return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
+    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
     size_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (; pos + 16 < len; pos += 16) {
+    for (;pos + 16 <= len; pos += 16) {
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v { v1 | v2 };
+        uint64_t v{v1 | v2};
         if ((v & 0x8080808080808080) != 0) {
-            for (; pos < len; pos++) {
-                if (data[pos] >= 0b10000000) {
-                    return result(error_code::TOO_LARGE, pos);
-                }
+            for (;pos < len; pos ++) {
+                if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
             }
         }
     }
     // process the tail byte-by-byte
-    for (; pos < len; pos++) {
-        if (data[pos] >= 0b10000000) {
-            return result(error_code::TOO_LARGE, pos);
-        }
+    for (;pos < len; pos ++) {
+        if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
     }
     return result(error_code::SUCCESS, pos);
 }
@@ -11313,245 +10370,183 @@ namespace utf8 {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // only used by the fallback kernel.
 // credit: based on code from Google Fuchsia (Apache Licensed)
-inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    uint64_t pos = 0;
-    uint32_t code_point = 0;
-    while (pos < len) {
-        // check of the next 8 bytes are ascii.
-        uint64_t next_pos = pos + 16;
-        if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v1;
-            std::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                pos = next_pos;
-                continue;
-            }
-        }
-        unsigned char byte = data[pos];
-
-        while (byte < 0b10000000) {
-            if (++pos == len) {
-                return true;
-            }
-            byte = data[pos];
-        }
-
-        if ((byte & 0b11100000) == 0b11000000) {
-            next_pos = pos + 2;
-            if (next_pos > len) {
-                return false;
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            // range check
-            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if ((code_point < 0x80) || (0x7ff < code_point)) {
-                return false;
-            }
-        } else if ((byte & 0b11110000) == 0b11100000) {
-            next_pos = pos + 3;
-            if (next_pos > len) {
-                return false;
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            // range check
-            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
-                return false;
-            }
-        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-            next_pos = pos + 4;
-            if (next_pos > len) {
-                return false;
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return false;
-            }
-            // range check
-            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff || 0x10ffff < code_point) {
-                return false;
-            }
-        } else {
-            // we may have a continuation
-            return false;
-        }
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+    uint64_t next_pos = pos + 16;
+    if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      std::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
         pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+
+    while (byte < 0b10000000) {
+      if (++pos == len) { return true; }
+      byte = data[pos];
+    }
+
+    if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point) ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+    } else {
+      // we may have a continuation
+      return false;
     }
-    return true;
+    pos = next_pos;
+  }
+  return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    uint32_t code_point = 0;
-    while (pos < len) {
-        // check of the next 8 bytes are ascii.
-        size_t next_pos = pos + 16;
-        if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v1;
-            std::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                pos = next_pos;
-                continue;
-            }
-        }
-        unsigned char byte = data[pos];
-
-        while (byte < 0b10000000) {
-            if (++pos == len) {
-                return result(error_code::SUCCESS, len);
-            }
-            byte = data[pos];
-        }
-
-        if ((byte & 0b11100000) == 0b11000000) {
-            next_pos = pos + 2;
-            if (next_pos > len) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if ((code_point < 0x80) || (0x7ff < code_point)) {
-                return result(error_code::OVERLONG, pos);
-            }
-        } else if ((byte & 0b11110000) == 0b11100000) {
-            next_pos = pos + 3;
-            if (next_pos > len) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if ((code_point < 0x800) || (0xffff < code_point)) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0xd7ff < code_point && code_point < 0xe000) {
-                return result(error_code::SURROGATE, pos);
-            }
-        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-            next_pos = pos + 4;
-            if (next_pos > len) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0x10ffff < code_point) {
-                return result(error_code::TOO_LARGE, pos);
-            }
-        } else {
-            // we either have too many continuation bytes or an invalid leading byte
-            if ((byte & 0b11000000) == 0b10000000) {
-                return result(error_code::TOO_LONG, pos);
-            } else {
-                return result(error_code::HEADER_BITS, pos);
-            }
-        }
+inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+    size_t next_pos = pos + 16;
+    if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      std::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
         pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+
+    while (byte < 0b10000000) {
+      if (++pos == len) { return result(error_code::SUCCESS, len); }
+      byte = data[pos];
+    }
+
+    if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
+      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
+      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
+      else { return result(error_code::HEADER_BITS, pos); }
     }
-    return result(error_code::SUCCESS, len);
+    pos = next_pos;
+  }
+  return result(error_code::SUCCESS, len);
 }
 
 // Finds the previous leading byte and validates with errors from there
 // Used to pinpoint the location of an error when an invalid chunk is detected
-inline simdutf_warn_unused result rewind_and_validate_with_errors(const char* buf, size_t len) noexcept
-{
-    size_t extra_len { 0 };
-    // A leading byte cannot be further than 4 bytes away
-    for (int i = 0; i < 5; i++) {
-        unsigned char byte = *buf;
-        if ((byte & 0b11000000) != 0b10000000) {
-            break;
-        } else {
-            buf--;
-            extra_len++;
-        }
+inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *buf, size_t len) noexcept {
+  size_t extra_len{0};
+  // A leading byte cannot be further than 4 bytes away
+  for(int i = 0; i < 5; i++) {
+    unsigned char byte = *buf;
+    if ((byte & 0b11000000) != 0b10000000) {
+      break;
+    } else {
+      buf--;
+      extra_len++;
     }
+  }
 
-    result res = validate_with_errors(buf, len + extra_len);
-    res.count -= extra_len;
-    return res;
+  result res = validate_with_errors(buf, len + extra_len);
+  res.count -= extra_len;
+  return res;
 }
 
-inline size_t count_code_points(const char* buf, size_t len)
-{
-    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
+inline size_t count_code_points(const char* buf, size_t len) {
+    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
+    size_t counter{0};
+    for(size_t i = 0; i < len; i++) {
         // -65 is 0b10111111, anything larger in two-complement's should start a new code point.
-        if (p[i] > -65) {
-            counter++;
-        }
+        if(p[i] > -65) { counter++; }
     }
     return counter;
 }
 
-inline size_t utf16_length_from_utf8(const char* buf, size_t len)
-{
-    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        if (p[i] > -65) {
-            counter++;
-        }
-        if (uint8_t(p[i]) >= 240) {
-            counter++;
-        }
+inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
+    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
+    size_t counter{0};
+    for(size_t i = 0; i < len; i++) {
+        if(p[i] > -65) { counter++; }
+        if(uint8_t(p[i]) >= 240) { counter++; }
     }
     return counter;
 }
 
-inline size_t latin1_length_from_utf8(const char* buf, size_t len)
-{
-    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
+inline size_t latin1_length_from_utf8(const char *buf, size_t len) {
+  const uint8_t * c = reinterpret_cast<const uint8_t *>(buf);
 
     size_t answer = len;
-    for (size_t i = 0; i < len; i++) {
-        if ((c[i] & 0b11100000) == 0b11000000) {
-            answer--;
-        } // if we have a two-byte UTF8 character
+    for(size_t i = 0; i < len; i++) {
+        if((c[i] & 0b11100000) == 0b11000000) { answer--;} //if we have a two-byte UTF8 character
     }
     return answer;
 }
@@ -11573,133 +10568,106 @@ namespace scalar {
 namespace {
 namespace utf16 {
 
-inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word)
-{
-    return uint16_t((word >> 8) | (word << 8));
-}
-
-template<endianness big_endian>
-inline simdutf_warn_unused bool validate(const char16_t* buf, size_t len) noexcept
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    uint64_t pos = 0;
-    while (pos < len) {
-        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xF800) == 0xD800) {
-            if (pos + 1 >= len) {
-                return false;
-            }
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return false;
-            }
-            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return false;
-            }
-            pos += 2;
-        } else {
-            pos++;
-        }
-    }
-    return true;
-}
-
-template<endianness big_endian>
-inline simdutf_warn_unused result validate_with_errors(const char16_t* buf, size_t len) noexcept
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    while (pos < len) {
-        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xF800) == 0xD800) {
-            if (pos + 1 >= len) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            pos += 2;
-        } else {
-            pos++;
-        }
-    }
-    return result(error_code::SUCCESS, pos);
-}
-
-template<endianness big_endian>
-inline size_t count_code_points(const char16_t* buf, size_t len)
-{
-    // We are not BOM aware.
-    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-        counter += ((word & 0xFC00) != 0xDC00);
-    }
-    return counter;
-}
-
-template<endianness big_endian>
-inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len)
-{
-    // We are not BOM aware.
-    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-        /** ASCII **/
-        if (word <= 0x7F) {
-            counter++;
-        }
-        /** two-byte **/
-        else if (word <= 0x7FF) {
-            counter += 2;
-        }
-        /** three-byte **/
-        else if ((word <= 0xD7FF) || (word >= 0xE000)) {
-            counter += 3;
-        }
-        /** surrogates -- 4 bytes **/
-        else {
-            counter += 2;
-        }
-    }
-    return counter;
-}
-
-template<endianness big_endian>
-inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len)
-{
-    // We are not BOM aware.
-    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-        counter += ((word & 0xFC00) != 0xDC00);
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
+  return uint16_t((word >> 8) | (word << 8));
+}
+
+template <endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  uint64_t pos = 0;
+  while (pos < len) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+    if((word &0xF800) == 0xD800) {
+        if(pos + 1 >= len) { return false; }
+        uint16_t diff = uint16_t(word - 0xD800);
+        if(diff > 0x3FF) { return false; }
+        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+        uint16_t diff2 = uint16_t(next_word - 0xDC00);
+        if(diff2 > 0x3FF) { return false; }
+        pos += 2;
+    } else {
+        pos++;
     }
-    return counter;
-}
-
-inline size_t latin1_length_from_utf16(size_t len)
-{
-    return len;
-}
-
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out)
-{
-    const uint16_t* input = reinterpret_cast<const uint16_t*>(in);
-    uint16_t* output = reinterpret_cast<uint16_t*>(out);
-    for (size_t i = 0; i < size; i++) {
-        *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
+  }
+  return true;
+}
+
+template <endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  while (pos < len) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+    if((word & 0xF800) == 0xD800) {
+        if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
+        uint16_t diff = uint16_t(word - 0xD800);
+        if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
+        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+        uint16_t diff2 = uint16_t(next_word - 0xDC00);
+        if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
+        pos += 2;
+    } else {
+        pos++;
     }
+  }
+  return result(error_code::SUCCESS, pos);
+}
+
+template <endianness big_endian>
+inline size_t count_code_points(const char16_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    counter += ((word & 0xFC00) != 0xDC00);
+  }
+  return counter;
+}
+
+template <endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    /** ASCII **/
+    if(word <= 0x7F) { counter++; }
+    /** two-byte **/
+    else if (word <= 0x7FF) { counter += 2; }
+    /** three-byte **/
+    else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; }
+    /** surrogates -- 4 bytes **/
+    else { counter += 2; }
+  }
+  return counter;
+}
+
+template <endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    counter += ((word & 0xFC00) != 0xDC00);
+  }
+  return counter;
+}
+
+
+inline size_t latin1_length_from_utf16(size_t len) {
+  return len;
+}
+
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
+  const uint16_t * input = reinterpret_cast<const uint16_t *>(in);
+  uint16_t * output = reinterpret_cast<uint16_t *>(out);
+  for (size_t i = 0; i < size; i++) {
+    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
+  }
 }
 
 } // utf16 namespace
@@ -11719,85 +10687,70 @@ namespace scalar {
 namespace {
 namespace utf32 {
 
-inline simdutf_warn_unused bool validate(const char32_t* buf, size_t len) noexcept
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    uint64_t pos = 0;
-    for (; pos < len; pos++) {
-        uint32_t word = data[pos];
-        if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
-            return false;
-        }
+inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  uint64_t pos = 0;
+  for(;pos < len; pos++) {
+    uint32_t word = data[pos];
+    if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
+        return false;
     }
-    return true;
+  }
+  return true;
 }
 
-inline simdutf_warn_unused result validate_with_errors(const char32_t* buf, size_t len) noexcept
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    for (; pos < len; pos++) {
-        uint32_t word = data[pos];
-        if (word > 0x10FFFF) {
-            return result(error_code::TOO_LARGE, pos);
-        }
-        if (word >= 0xD800 && word <= 0xDFFF) {
-            return result(error_code::SURROGATE, pos);
-        }
+inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  for(;pos < len; pos++) {
+    uint32_t word = data[pos];
+    if(word > 0x10FFFF) {
+        return result(error_code::TOO_LARGE, pos);
     }
-    return result(error_code::SUCCESS, pos);
+    if(word >= 0xD800 && word <= 0xDFFF) {
+        return result(error_code::SURROGATE, pos);
+    }
+  }
+  return result(error_code::SUCCESS, pos);
 }
 
-inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len)
-{
-    // We are not BOM aware.
-    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        /** ASCII **/
-        if (p[i] <= 0x7F) {
-            counter++;
-        }
-        /** two-byte **/
-        else if (p[i] <= 0x7FF) {
-            counter += 2;
-        }
-        /** three-byte **/
-        else if (p[i] <= 0xFFFF) {
-            counter += 3;
-        }
-        /** four-bytes **/
-        else {
-            counter += 4;
-        }
-    }
-    return counter;
+inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    /** ASCII **/
+    if(p[i] <= 0x7F) { counter++; }
+    /** two-byte **/
+    else if(p[i] <= 0x7FF) { counter += 2; }
+    /** three-byte **/
+    else if(p[i] <= 0xFFFF) { counter += 3; }
+    /** four-bytes **/
+    else { counter += 4; }
+  }
+  return counter;
 }
 
-inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len)
-{
-    // We are not BOM aware.
-    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
-    size_t counter { 0 };
-    for (size_t i = 0; i < len; i++) {
-        /** non-surrogate word **/
-        if (p[i] <= 0xFFFF) {
-            counter++;
-        }
-        /** surrogate pair **/
-        else {
-            counter += 2;
-        }
-    }
-    return counter;
+inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    /** non-surrogate word **/
+    if(p[i] <= 0xFFFF) { counter++; }
+    /** surrogate pair **/
+    else { counter += 2; }
+  }
+  return counter;
 }
 
-inline size_t latin1_length_from_utf32(size_t len)
-{
-    // We are not BOM aware.
-    return len; // a utf32 codepoint will always represent 1 latin1 character
+inline size_t latin1_length_from_utf32(size_t len) {
+  // We are not BOM aware.
+  return len; // a utf32 codepoint will always represent 1 latin1 character
 }
 
+
+
 } // utf32 namespace
 } // unnamed namespace
 } // namespace scalar
@@ -11815,27 +10768,22 @@ namespace scalar {
 namespace {
 namespace latin1 {
 
-inline size_t utf32_length_from_latin1(size_t len)
-{
-    // We are not BOM aware.
-    return len; // a utf32 unit will always represent 1 latin1 character
+inline size_t utf32_length_from_latin1(size_t len) {
+  // We are not BOM aware.
+  return len; // a utf32 unit will always represent 1 latin1 character
 }
 
-inline size_t utf8_length_from_latin1(const char* buf, size_t len)
-{
-    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
-    size_t answer = 0;
-    for (size_t i = 0; i < len; i++) {
-        if ((c[i] >> 7)) {
-            answer++;
-        }
-    }
-    return answer + len;
+inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
+  const uint8_t * c = reinterpret_cast<const uint8_t *>(buf);
+  size_t answer = 0;
+  for(size_t i = 0; i<len; i++) {
+    if((c[i]>>7)) { answer++; }
+  }
+  return answer + len;
 }
 
-inline size_t utf16_length_from_latin1(size_t len)
-{
-    return len;
+inline size_t utf16_length_from_latin1(size_t len) {
+  return len;
 }
 
 } // utf32 namespace
@@ -11858,52 +10806,51 @@ namespace utf32_to_utf8 {
 
 #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 // only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 2 ASCII characters
-        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-                *utf8_output++ = char(buf[pos]);
-                *utf8_output++ = char(buf[pos + 1]);
-                pos += 2;
-                continue;
-            }
-        }
-        uint32_t word = data[pos];
-        if ((word & 0xFFFFFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xFFFFF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xFFFF0000) == 0) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 18) | 0b11110000);
-            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        }
+inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
+	const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+        *utf8_output++ = char(buf[pos]);
+				*utf8_output++ = char(buf[pos+1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if((word & 0xFFFFFF80)==0) {
+      // will generate one UTF-8 bytes
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xFFFFF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xFFFF0000)==0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>18) | 0b11110000);
+      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos ++;
     }
-    return utf8_output - start;
+  }
+  return utf8_output - start;
 }
 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -11924,112 +10871,102 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf8 {
 
-inline size_t convert(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 2 ASCII characters
-        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-                *utf8_output++ = char(buf[pos]);
-                *utf8_output++ = char(buf[pos + 1]);
-                pos += 2;
-                continue;
-            }
-        }
-        uint32_t word = data[pos];
-        if ((word & 0xFFFFFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xFFFFF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xFFFF0000) == 0) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            if (word >= 0xD800 && word <= 0xDFFF) {
-                return 0;
-            }
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            if (word > 0x10FFFF) {
-                return 0;
-            }
-            *utf8_output++ = char((word >> 18) | 0b11110000);
-            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        }
-    }
-    return utf8_output - start;
-}
-
-inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 2 ASCII characters
-        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-                *utf8_output++ = char(buf[pos]);
-                *utf8_output++ = char(buf[pos + 1]);
-                pos += 2;
-                continue;
-            }
-        }
-        uint32_t word = data[pos];
-        if ((word & 0xFFFFFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xFFFFF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xFFFF0000) == 0) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            if (word >= 0xD800 && word <= 0xDFFF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            if (word > 0x10FFFF) {
-                return result(error_code::TOO_LARGE, pos);
-            }
-            *utf8_output++ = char((word >> 18) | 0b11110000);
-            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        }
+inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+        *utf8_output++ = char(buf[pos]);
+				*utf8_output++ = char(buf[pos+1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if((word & 0xFFFFFF80)==0) {
+      // will generate one UTF-8 bytes
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xFFFFF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xFFFF0000)==0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+			if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+			if (word > 0x10FFFF) { return 0; }
+      *utf8_output++ = char((word>>18) | 0b11110000);
+      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos ++;
+    }
+  }
+  return utf8_output - start;
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+        *utf8_output++ = char(buf[pos]);
+				*utf8_output++ = char(buf[pos+1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if((word & 0xFFFFFF80)==0) {
+      // will generate one UTF-8 bytes
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xFFFFF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xFFFF0000)==0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
+      *utf8_output++ = char((word>>18) | 0b11110000);
+      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos ++;
     }
-    return result(error_code::SUCCESS, utf8_output - start);
+  }
+  return result(error_code::SUCCESS, utf8_output - start);
 }
 
 } // utf32_to_utf8 namespace
@@ -12050,33 +10987,32 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template<endianness big_endian>
-inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        uint32_t word = data[pos];
-        if ((word & 0xFFFF0000) == 0) {
-            // will not generate a surrogate pair
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-            pos++;
-        } else {
-            // will generate a surrogate pair
-            word -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-            pos++;
-        }
-    }
-    return utf16_output - start;
+template <endianness big_endian>
+inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if((word & 0xFFFF0000)==0) {
+      // will not generate a surrogate pair
+      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+      pos++;
+    } else {
+      // will generate a surrogate pair
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos++;
+    }
+  }
+  return utf16_output - start;
 }
 
 } // utf32_to_utf16 namespace
@@ -12096,72 +11032,62 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template<endianness big_endian>
-inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        uint32_t word = data[pos];
-        if ((word & 0xFFFF0000) == 0) {
-            if (word >= 0xD800 && word <= 0xDFFF) {
-                return 0;
-            }
-            // will not generate a surrogate pair
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-        } else {
-            // will generate a surrogate pair
-            if (word > 0x10FFFF) {
-                return 0;
-            }
-            word -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-        }
-        pos++;
-    }
-    return utf16_output - start;
-}
-
-template<endianness big_endian>
-inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        uint32_t word = data[pos];
-        if ((word & 0xFFFF0000) == 0) {
-            if (word >= 0xD800 && word <= 0xDFFF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            // will not generate a surrogate pair
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-        } else {
-            // will generate a surrogate pair
-            if (word > 0x10FFFF) {
-                return result(error_code::TOO_LARGE, pos);
-            }
-            word -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-        }
-        pos++;
-    }
-    return result(error_code::SUCCESS, utf16_output - start);
+template <endianness big_endian>
+inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if((word & 0xFFFF0000)==0) {
+      if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
+      // will not generate a surrogate pair
+      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) { return 0; }
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+    }
+    pos++;
+  }
+  return utf16_output - start;
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if((word & 0xFFFF0000)==0) {
+      if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
+      // will not generate a surrogate pair
+      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+    }
+    pos++;
+  }
+  return result(error_code::SUCCESS, utf16_output - start);
 }
 
 } // utf32_to_utf16 namespace
@@ -12182,67 +11108,62 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template<endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 4 ASCII characters
-        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if (!match_system(big_endian)) {
-                v = (v >> 8) | (v << (64 - 8));
-            }
-            if ((v & 0xFF80FF80FF80FF80) == 0) {
-                size_t final_pos = pos + 4;
-                while (pos < final_pos) {
-                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xF800) != 0xD800) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((value >> 18) | 0b11110000);
-            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((value & 0b111111) | 0b10000000);
-            pos += 2;
-        }
-    }
-    return utf8_output - start;
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) { // UTF-16 -> UTF-8 without validation; returns the number of bytes written
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char* start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters
+    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } // rotate right by 8 bits so the mask test below also applies to byte-swapped words
+      if ((v & 0xFF80FF80FF80FF80) == 0) { // every one of the four words is below 0x80
+        size_t final_pos = pos + 4;
+        while(pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word & 0xFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800); // not range-checked in this function
+      if(pos + 1 >= len) { return 0; } // minimal bound checking: a pair cut off by the end of input returns 0
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00); // not range-checked in this function
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value>>18) | 0b11110000);
+      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2; // both words of the pair were consumed
+    }
+  }
+  return utf8_output - start;
 }
 
 } // utf16_to_utf8 namespace
@@ -12262,139 +11183,122 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template<endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 8 ASCII characters
-        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if (!match_system(big_endian)) {
-                v = (v >> 8) | (v << (64 - 8));
-            }
-            if ((v & 0xFF80FF80FF80FF80) == 0) {
-                size_t final_pos = pos + 4;
-                while (pos < final_pos) {
-                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xF800) != 0xD800) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            if (pos + 1 >= len) {
-                return 0;
-            }
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return 0;
-            }
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return 0;
-            }
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((value >> 18) | 0b11110000);
-            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((value & 0b111111) | 0b10000000);
-            pos += 2;
-        }
-    }
-    return utf8_output - start;
-}
-
-template<endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 8 ASCII characters
-        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if (!match_system(big_endian))
-                v = (v >> 8) | (v << (64 - 8));
-            if ((v & 0xFF80FF80FF80FF80) == 0) {
-                size_t final_pos = pos + 4;
-                while (pos < final_pos) {
-                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xFF80) == 0) {
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(word);
-            pos++;
-        } else if ((word & 0xF800) == 0) {
-            // will generate two UTF-8 bytes
-            // we have 0b110XXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 6) | 0b11000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else if ((word & 0xF800) != 0xD800) {
-            // will generate three UTF-8 bytes
-            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((word >> 12) | 0b11100000);
-            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((word & 0b111111) | 0b10000000);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            if (pos + 1 >= len) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            // will generate four UTF-8 bytes
-            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-            *utf8_output++ = char((value >> 18) | 0b11110000);
-            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-            *utf8_output++ = char((value & 0b111111) | 0b10000000);
-            pos += 2;
-        }
-    }
-    return result(error_code::SUCCESS, utf8_output - start);
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) { // UTF-16 -> UTF-8 with surrogate checks; returns bytes written, or 0 on a bad or truncated pair
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char* start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters (8 bytes)
+    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian)) { v = (v >> 8) | (v << (64 - 8)); } // rotate right by 8 bits so the mask test below also applies to byte-swapped words
+      if ((v & 0xFF80FF80FF80FF80) == 0) { // every one of the four words is below 0x80
+        size_t final_pos = pos + 4;
+        while(pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word & 0xFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      if(pos + 1 >= len) { return 0; } // no second word available
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return 0; } // first word is 0xDC00 or above, so it cannot start a pair
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return 0; } // second word is outside 0xDC00-0xDFFF
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value>>18) | 0b11110000);
+      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2; // both words of the pair were consumed
+    }
+  }
+  return utf8_output - start;
+}
+
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) { // UTF-16 -> UTF-8, reporting the input index of the first bad surrogate
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char* start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters (8 bytes)
+    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits so the mask test below also applies to byte-swapped words
+      if ((v & 0xFF80FF80FF80FF80) == 0) { // every one of the four words is below 0x80
+        size_t final_pos = pos + 4;
+        while(pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word & 0xFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // no second word available
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } // first word is 0xDC00 or above, so it cannot start a pair
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } // second word is outside 0xDC00-0xDFFF; pos still indexes the first word
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value>>18) | 0b11110000);
+      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2; // both words of the pair were consumed
+    }
+  }
+  return result(error_code::SUCCESS, utf8_output - start); // on success the second field is the number of bytes written
 }
 
 } // utf16_to_utf8 namespace
@@ -12415,32 +11319,29 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template<endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xF800) != 0xD800) {
-            // No surrogate pair, extend 16-bit word to 32-bit word
-            *utf32_output++ = char32_t(word);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            *utf32_output++ = char32_t(value);
-            pos += 2;
-        }
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) { // UTF-16 -> UTF-32 without validation; returns the number of char32_t written
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char32_t* start{utf32_output}; // kept to compute the number of char32_t written
+  while (pos < len) {
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800); // not range-checked in this function
+      if(pos + 1 >= len) { return 0; } // minimal bound checking: a pair cut off by the end of input returns 0
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00); // not range-checked in this function
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      *utf32_output++ = char32_t(value);
+      pos += 2; // both words of the pair were consumed
     }
-    return utf32_output - start;
+  }
+  return utf32_output - start;
 }
 
 } // utf16_to_utf32 namespace
@@ -12460,72 +11361,58 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template<endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xF800) != 0xD800) {
-            // No surrogate pair, extend 16-bit word to 32-bit word
-            *utf32_output++ = char32_t(word);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return 0;
-            }
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return 0;
-            }
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            *utf32_output++ = char32_t(value);
-            pos += 2;
-        }
-    }
-    return utf32_output - start;
-}
-
-template<endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xF800) != 0xD800) {
-            // No surrogate pair, extend 16-bit word to 32-bit word
-            *utf32_output++ = char32_t(word);
-            pos++;
-        } else {
-            // must be a surrogate pair
-            uint16_t diff = uint16_t(word - 0xD800);
-            if (diff > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            if (pos + 1 >= len) {
-                return result(error_code::SURROGATE, pos);
-            } // minimal bound checking
-            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-            uint16_t diff2 = uint16_t(next_word - 0xDC00);
-            if (diff2 > 0x3FF) {
-                return result(error_code::SURROGATE, pos);
-            }
-            uint32_t value = (diff << 10) + diff2 + 0x10000;
-            *utf32_output++ = char32_t(value);
-            pos += 2;
-        }
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) { // UTF-16 -> UTF-32 with surrogate checks; returns char32_t written, or 0 on a bad or truncated pair
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char32_t* start{utf32_output}; // kept to compute the number of char32_t written
+  while (pos < len) {
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return 0; } // first word is 0xDC00 or above, so it cannot start a pair
+      if(pos + 1 >= len) { return 0; } // minimal bound checking
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return 0; } // second word is outside 0xDC00-0xDFFF
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      *utf32_output++ = char32_t(value);
+      pos += 2; // both words of the pair were consumed
+    }
+  }
+  return utf32_output - start;
+}
+
+template <endianness big_endian> // big_endian: byte order of the UTF-16 input; words are byte-swapped before use when match_system(big_endian) is false
+inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) { // UTF-16 -> UTF-32, reporting the input index of the first bad surrogate
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 word to read
+  char32_t* start{utf32_output}; // kept to compute the number of char32_t written
+  while (pos < len) {
+    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if((word &0xF800 ) != 0xD800) { // word is outside the surrogate range 0xD800-0xDFFF
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); } // first word is 0xDC00 or above, so it cannot start a pair
+      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
+      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); } // second word is outside 0xDC00-0xDFFF; pos still indexes the first word
+      uint32_t value = (diff << 10) + diff2 + 0x10000; // recombine the two words into one value
+      *utf32_output++ = char32_t(value);
+      pos += 2; // both words of the pair were consumed
     }
-    return result(error_code::SUCCESS, utf32_output - start);
+  }
+  return result(error_code::SUCCESS, utf32_output - start); // on success the second field is the number of char32_t written
 }
 
 } // utf16_to_utf32 namespace
@@ -12546,80 +11433,74 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template<endianness big_endian>
-inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        // try to convert the next block of 8 ASCII bytes
-        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 8;
-                while (pos < final_pos) {
-                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 1 >= len) {
-                break;
-            } // minimal bound checking
-            uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
-            if (!match_system(big_endian)) {
-                code_point = utf16::swap_bytes(uint16_t(code_point));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 2 >= len) {
-                break;
-            } // minimal bound checking
-            uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
-            if (!match_system(big_endian)) {
-                code_point = utf16::swap_bytes(uint16_t(code_point));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                break;
-            } // minimal bound checking
-            uint32_t code_point = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
-                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
-            code_point -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-            pos += 4;
-        } else {
-            // we may have a continuation but we do not do error checking
-            return 0;
-        }
+template <endianness big_endian> // big_endian: byte order wanted for the UTF-16 output; words are byte-swapped before storing when match_system(big_endian) is false
+inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) { // UTF-8 -> UTF-16 without validation; returns the number of char16_t written
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0; // index of the next input byte to read
+  char16_t* start{utf16_output}; // kept to compute the number of char16_t written
+  while (pos < len) {
+    // try to convert the next block of 8 ASCII bytes
+    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set
+        size_t final_pos = pos + 8;
+        while(pos < final_pos) {
+          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte
+      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if(pos + 1 >= len) { break; } // minimal bound checking: a truncated sequence ends the loop and the count so far is returned
+      uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
+      if (!match_system(big_endian)) {
+        code_point = utf16::swap_bytes(uint16_t(code_point));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if(pos + 2 >= len) { break; } // minimal bound checking: a truncated sequence ends the loop and the count so far is returned
+      uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
+      if (!match_system(big_endian)) {
+        code_point = utf16::swap_bytes(uint16_t(code_point));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // leading byte has the form 0b11110XXX
+      // we have a 4-byte UTF-8 word.
+      if(pos + 3 >= len) { break; } // minimal bound checking: a truncated sequence ends the loop and the count so far is returned
+      uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
+                           | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
+      code_point -= 0x10000; // reduce before splitting into two 16-bit words
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // we may have a continuation but we do not do error checking
+      return 0;
     }
-    return utf16_output - start;
+  }
+  return utf16_output - start;
 }
 
+
 } // namespace utf8_to_utf16
 } // unnamed namespace
 } // namespace scalar
@@ -12637,230 +11518,184 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template<endianness big_endian>
-inline size_t convert(const char* buf, size_t len, char16_t* utf16_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if (code_point < 0x80 || 0x7ff < code_point) {
-                return 0;
-            }
-            if (!match_system(big_endian)) {
-                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 2 >= len) {
-                return 0;
-            } // minimal bound checking
-
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) {
-                return 0;
-            }
-            if (!match_system(big_endian)) {
-                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                return 0;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-
-            // range check
-            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff || 0x10ffff < code_point) {
-                return 0;
-            }
-            code_point -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-            pos += 4;
-        } else {
-            return 0;
-        }
-    }
-    return utf16_output - start;
-}
-
-template<endianness big_endian>
-inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 1 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if (code_point < 0x80 || 0x7ff < code_point) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (!match_system(big_endian)) {
-                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8, it should become
-            // a single UTF-16 word.
-            if (pos + 2 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if ((code_point < 0x800) || (0xffff < code_point)) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0xd7ff < code_point && code_point < 0xe000) {
-                return result(error_code::SURROGATE, pos);
-            }
-            if (!match_system(big_endian)) {
-                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-            }
-            *utf16_output++ = char16_t(code_point);
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-
-            // range check
-            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0x10ffff < code_point) {
-                return result(error_code::TOO_LARGE, pos);
-            }
-            code_point -= 0x10000;
-            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-            if (!match_system(big_endian)) {
-                high_surrogate = utf16::swap_bytes(high_surrogate);
-                low_surrogate = utf16::swap_bytes(low_surrogate);
-            }
-            *utf16_output++ = char16_t(high_surrogate);
-            *utf16_output++ = char16_t(low_surrogate);
-            pos += 4;
-        } else {
-            // we either have too many continuation bytes or an invalid leading byte
-            if ((leading_byte & 0b11000000) == 0b10000000) {
-                return result(error_code::TOO_LONG, pos);
-            } else {
-                return result(error_code::HEADER_BITS, pos);
-            }
-        }
+/**
+ * Scalar UTF-8 to UTF-16 transcoder that validates while it converts.
+ *
+ * Reads 'len' bytes from 'buf' and stores the produced code units through
+ * 'utf16_output'. Each code unit is byte-swapped before being stored when
+ * match_system(big_endian) is false.
+ *
+ * Returns the number of char16_t code units written. Returns 0 as soon as a
+ * truncated sequence, a bad continuation byte, an out-of-range or surrogate
+ * code point, or an unexpected leading byte is met (empty input yields 0 too).
+ */
+template <endianness big_endian>
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
+  char16_t* const out_begin = utf16_output;
+  size_t pos = 0;
+  while (pos < len) {
+    // Fast path: if 16 more bytes are readable and none of them has its high
+    // bit set, the block is pure ASCII and each byte maps to one code unit.
+    if (pos + 16 <= len) {
+      uint64_t lo;
+      uint64_t hi;
+      ::memcpy(&lo, bytes + pos, sizeof(uint64_t));
+      ::memcpy(&hi, bytes + pos + sizeof(uint64_t), sizeof(uint64_t));
+      if (((lo | hi) & 0x8080808080808080) == 0) {
+        for (const size_t block_end = pos + 16; pos < block_end; pos++) {
+          *utf16_output++ = match_system(big_endian) ? char16_t(buf[pos]) : char16_t(utf16::swap_bytes(buf[pos]));
+        }
+        continue;
+      }
+    }
+
+    const uint8_t lead = bytes[pos];
+    if (lead < 0b10000000) {
+      // A single ASCII byte.
+      *utf16_output++ = match_system(big_endian) ? char16_t(lead) : char16_t(utf16::swap_bytes(lead));
+      pos++;
+    } else if ((lead & 0b11100000) == 0b11000000) {
+      // Two-byte sequence: becomes a single UTF-16 code unit.
+      if (pos + 1 >= len) { return 0; } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      if ((c1 & 0b11000000) != 0b10000000) { return 0; } // not a continuation byte
+      const uint32_t code_point = ((lead & 0b00011111) << 6) | (c1 & 0b00111111);
+      if (code_point < 0x80 || code_point > 0x7ff) { return 0; } // range check
+      uint16_t unit = uint16_t(code_point);
+      if (!match_system(big_endian)) {
+        unit = utf16::swap_bytes(unit);
+      }
+      *utf16_output++ = char16_t(unit);
+      pos += 2;
+    } else if ((lead & 0b11110000) == 0b11100000) {
+      // Three-byte sequence: becomes a single UTF-16 code unit.
+      if (pos + 2 >= len) { return 0; } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      const uint8_t c2 = bytes[pos + 2];
+      if ((c1 & 0b11000000) != 0b10000000 || (c2 & 0b11000000) != 0b10000000) { return 0; }
+      const uint32_t code_point = ((lead & 0b00001111) << 12) |
+                                  ((c1 & 0b00111111) << 6) |
+                                  (c2 & 0b00111111);
+      // range check; code points inside 0xd800..0xdfff are rejected as well
+      const bool in_surrogate_range = code_point > 0xd7ff && code_point < 0xe000;
+      if (code_point < 0x800 || code_point > 0xffff || in_surrogate_range) {
+        return 0;
+      }
+      uint16_t unit = uint16_t(code_point);
+      if (!match_system(big_endian)) {
+        unit = utf16::swap_bytes(unit);
+      }
+      *utf16_output++ = char16_t(unit);
+      pos += 3;
+    } else if ((lead & 0b11111000) == 0b11110000) {
+      // Four-byte sequence: becomes a surrogate pair.
+      if (pos + 3 >= len) { return 0; } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      const uint8_t c2 = bytes[pos + 2];
+      const uint8_t c3 = bytes[pos + 3];
+      if ((c1 & 0b11000000) != 0b10000000 || (c2 & 0b11000000) != 0b10000000 ||
+          (c3 & 0b11000000) != 0b10000000) {
+        return 0;
+      }
+
+      uint32_t code_point = ((lead & 0b00000111) << 18) | ((c1 & 0b00111111) << 12) |
+                            ((c2 & 0b00111111) << 6) | (c3 & 0b00111111);
+      if (code_point <= 0xffff || code_point > 0x10ffff) { return 0; } // range check
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // Any other leading byte is rejected.
+      return 0;
+    }
+  }
+  return utf16_output - out_begin;
+}
+
+/**
+ * Scalar UTF-8 to UTF-16 transcoder that validates and reports what failed.
+ *
+ * Reads 'len' bytes from 'buf' and stores the produced code units through
+ * 'utf16_output', byte-swapping each one when match_system(big_endian) is
+ * false.
+ *
+ * On success the result holds error_code::SUCCESS and the number of char16_t
+ * code units written. On failure it holds the error code and the position in
+ * 'buf' of the leading byte of the sequence that was rejected.
+ */
+template <endianness big_endian>
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
+  char16_t* const out_begin = utf16_output;
+  size_t pos = 0;
+  while (pos < len) {
+    // Fast path: if 16 more bytes are readable and none of them has its high
+    // bit set, the block is pure ASCII and each byte maps to one code unit.
+    if (pos + 16 <= len) {
+      uint64_t lo;
+      uint64_t hi;
+      ::memcpy(&lo, bytes + pos, sizeof(uint64_t));
+      ::memcpy(&hi, bytes + pos + sizeof(uint64_t), sizeof(uint64_t));
+      if (((lo | hi) & 0x8080808080808080) == 0) {
+        for (const size_t block_end = pos + 16; pos < block_end; pos++) {
+          *utf16_output++ = match_system(big_endian) ? char16_t(buf[pos]) : char16_t(utf16::swap_bytes(buf[pos]));
+        }
+        continue;
+      }
+    }
+    const uint8_t lead = bytes[pos];
+    if (lead < 0b10000000) {
+      // A single ASCII byte.
+      *utf16_output++ = match_system(big_endian) ? char16_t(lead) : char16_t(utf16::swap_bytes(lead));
+      pos++;
+    } else if ((lead & 0b11100000) == 0b11000000) {
+      // Two-byte sequence: becomes a single UTF-16 code unit.
+      if (pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      if ((c1 & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      const uint32_t code_point = ((lead & 0b00011111) << 6) | (c1 & 0b00111111);
+      // range check
+      if (code_point < 0x80 || code_point > 0x7ff) { return result(error_code::OVERLONG, pos); }
+      uint16_t unit = uint16_t(code_point);
+      if (!match_system(big_endian)) {
+        unit = utf16::swap_bytes(unit);
+      }
+      *utf16_output++ = char16_t(unit);
+      pos += 2;
+    } else if ((lead & 0b11110000) == 0b11100000) {
+      // Three-byte sequence: becomes a single UTF-16 code unit.
+      if (pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      const uint8_t c2 = bytes[pos + 2];
+      if ((c1 & 0b11000000) != 0b10000000 || (c2 & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      const uint32_t code_point = ((lead & 0b00001111) << 12) |
+                                  ((c1 & 0b00111111) << 6) |
+                                  (c2 & 0b00111111);
+      // range check first, then the surrogate range 0xd800..0xdfff
+      if (code_point < 0x800 || code_point > 0xffff) { return result(error_code::OVERLONG, pos); }
+      if (code_point > 0xd7ff && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
+      uint16_t unit = uint16_t(code_point);
+      if (!match_system(big_endian)) {
+        unit = utf16::swap_bytes(unit);
+      }
+      *utf16_output++ = char16_t(unit);
+      pos += 3;
+    } else if ((lead & 0b11111000) == 0b11110000) {
+      // Four-byte sequence: becomes a surrogate pair.
+      if (pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // truncated input
+      const uint8_t c1 = bytes[pos + 1];
+      const uint8_t c2 = bytes[pos + 2];
+      const uint8_t c3 = bytes[pos + 3];
+      if ((c1 & 0b11000000) != 0b10000000 || (c2 & 0b11000000) != 0b10000000 ||
+          (c3 & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+
+      uint32_t code_point = ((lead & 0b00000111) << 18) | ((c1 & 0b00111111) << 12) |
+                            ((c2 & 0b00111111) << 6) | (c3 & 0b00111111);
+      // range check
+      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
+      if (code_point > 0x10ffff) { return result(error_code::TOO_LARGE, pos); }
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // A stray continuation byte is reported as TOO_LONG, any other byte as HEADER_BITS.
+      const bool stray_continuation = (lead & 0b11000000) == 0b10000000;
+      return result(stray_continuation ? error_code::TOO_LONG : error_code::HEADER_BITS, pos);
     }
-    return result(error_code::SUCCESS, utf16_output - start);
+  }
+  return result(error_code::SUCCESS, utf16_output - out_begin);
 }
 
 /**
@@ -12876,44 +11711,43 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o
  * If the error is believed to have occurred prior to 'buf', the count value contained in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-template<endianness endian>
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output)
-{
-    size_t extra_len { 0 };
-    // We potentially need to go back in time and find a leading byte.
-    // In theory '3' would be sufficient, but sometimes the error can go back quite far.
-    size_t how_far_back = prior_bytes;
-    // size_t how_far_back = 3; // 3 bytes in the past + current position
-    // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
-    bool found_leading_bytes { false };
-    // important: it is i <= how_far_back and not 'i < how_far_back'.
-    for (size_t i = 0; i <= how_far_back; i++) {
-        unsigned char byte = buf[0 - i];
-        found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-        if (found_leading_bytes) {
-            buf -= i;
-            extra_len = i;
-            break;
-        }
-    }
-    //
-    // It is possible for this function to return a negative count in its result.
-    // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-    // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-    //
-    // An unsigned type will simply wrap round arithmetically (well defined).
-    //
-    if (!found_leading_bytes) {
-        // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-        // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-        // Or we possibly have a stream that does not start with a leading byte.
-        return result(error_code::TOO_LONG, 0 - how_far_back);
-    }
-    result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
-    if (res.error) {
-        res.count -= extra_len;
-    }
-    return res;
+template <endianness endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
+  // Number of bytes we stepped back from 'buf' to reach a leading byte.
+  size_t extra_len{0};
+  // We potentially need to go back in time and find a leading byte.
+  // In theory '3' would be sufficient, but sometimes the error can go back quite far,
+  // so every one of the 'prior_bytes' bytes in front of 'buf' may be inspected.
+  size_t how_far_back = prior_bytes;
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back':
+  // i == 0 inspects the byte at 'buf' itself, i == how_far_back the earliest prior byte.
+  for(size_t i = 0; i <= how_far_back; i++) {
+    // Step back with pointer subtraction: indexing with '0-i' would negate i in
+    // unsigned arithmetic and add the wrapped-around offset to the pointer.
+    unsigned char byte = *(buf - i);
+    // Any byte that is not of the form 10xxxxxx is treated as a leading byte.
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if(found_leading_bytes) {
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
+  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
+  //
+  // An unsigned type will simply wrap round arithmetically (well defined).
+  //
+  if(!found_leading_bytes) {
+    // Every inspected byte, from 'buf' back to the earliest prior byte, is a continuation byte:
+    // [continuation] ... [continuation] | [buf is continuation]
+    // so the stream does not start with a leading byte. The reported count is
+    // the (wrapped) negative offset of that earliest byte relative to 'buf'.
+    return result(error_code::TOO_LONG, 0-how_far_back);
+  }
+  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
+  if (res.error) {
+    // Make the error position relative to the 'buf' the caller passed in;
+    // it wraps around when the error lies before that point.
+    res.count -= extra_len;
+  }
+  return res;
 }
 
 } // utf8_to_utf16 namespace
@@ -12934,61 +11768,55 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        // try to convert the next block of 8 ASCII bytes
-        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 8;
-                while (pos < final_pos) {
-                    *utf32_output++ = char32_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf32_output++ = char32_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                break;
-            } // minimal bound checking
-            *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8
-            if (pos + 2 >= len) {
-                break;
-            } // minimal bound checking
-            *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                break;
-            } // minimal bound checking
-            uint32_t code_word = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
-                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
-            *utf32_output++ = char32_t(code_word);
-            pos += 4;
-        } else {
-            // we may have a continuation but we do not do error checking
-            return 0;
-        }
+/**
+ * Scalar UTF-8 to UTF-32 transcoder that performs no validation.
+ *
+ * Reads 'len' bytes from 'buf' and stores one char32_t per decoded sequence
+ * through 'utf32_output'. Continuation bytes are not checked. A multi-byte
+ * sequence cut off by the end of the input stops the conversion, and a byte
+ * that is not a recognised leading byte makes the function return 0.
+ *
+ * Returns the number of char32_t values written.
+ */
+inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
+  const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
+  char32_t* const out_begin = utf32_output;
+  size_t pos = 0;
+  while (pos < len) {
+    // Fast path: if 8 more bytes are readable and none of them has its high
+    // bit set, the block is pure ASCII and each byte maps to one char32_t.
+    if (pos + 8 <= len) {
+      uint64_t chunk;
+      ::memcpy(&chunk, bytes + pos, sizeof(uint64_t));
+      if ((chunk & 0x8080808080808080) == 0) {
+        for (const size_t block_end = pos + 8; pos < block_end; pos++) {
+          *utf32_output++ = char32_t(buf[pos]);
+        }
+        continue;
+      }
+    }
+    const uint8_t lead = bytes[pos];
+    if (lead < 0b10000000) {
+      // A single ASCII byte.
+      *utf32_output++ = char32_t(lead);
+      pos++;
+    } else if ((lead & 0b11100000) == 0b11000000) {
+      // Two-byte sequence.
+      if (pos + 1 >= len) { break; } // minimal bound checking
+      const uint32_t code_point = ((lead & 0b00011111) << 6) | (bytes[pos + 1] & 0b00111111);
+      *utf32_output++ = char32_t(code_point);
+      pos += 2;
+    } else if ((lead & 0b11110000) == 0b11100000) {
+      // Three-byte sequence.
+      if (pos + 2 >= len) { break; } // minimal bound checking
+      const uint32_t code_point = ((lead & 0b00001111) << 12) |
+                                  ((bytes[pos + 1] & 0b00111111) << 6) |
+                                  (bytes[pos + 2] & 0b00111111);
+      *utf32_output++ = char32_t(code_point);
+      pos += 3;
+    } else if ((lead & 0b11111000) == 0b11110000) {
+      // Four-byte sequence.
+      if (pos + 3 >= len) { break; } // minimal bound checking
+      const uint32_t code_point = ((lead & 0b00000111) << 18) |
+                                  ((bytes[pos + 1] & 0b00111111) << 12) |
+                                  ((bytes[pos + 2] & 0b00111111) << 6) |
+                                  (bytes[pos + 3] & 0b00111111);
+      *utf32_output++ = char32_t(code_point);
+      pos += 4;
+    } else {
+      // Possibly a continuation byte; no error checking is done here.
+      return 0;
     }
-    return utf32_output - start;
+  }
+  return utf32_output - out_begin;
 }
 
+
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace scalar
@@ -13006,195 +11834,149 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert(const char* buf, size_t len, char32_t* utf32_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *utf32_output++ = char32_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf32_output++ = char32_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if (code_point < 0x80 || 0x7ff < code_point) {
-                return 0;
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8
-            if (pos + 2 >= len) {
-                return 0;
-            } // minimal bound checking
-
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) {
-                return 0;
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                return 0;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return 0;
-            }
-
-            // range check
-            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff || 0x10ffff < code_point) {
-                return 0;
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 4;
-        } else {
-            return 0;
-        }
-    }
-    return utf32_output - start;
-}
-
-inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 };
-            if ((v & 0x8080808080808080) == 0) {
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *utf32_output++ = char32_t(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *utf32_output++ = char32_t(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) {
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-            if (code_point < 0x80 || 0x7ff < code_point) {
-                return result(error_code::OVERLONG, pos);
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8
-            if (pos + 2 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            // range check
-            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
-            if (code_point < 0x800 || 0xffff < code_point) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0xd7ff < code_point && code_point < 0xe000) {
-                return result(error_code::SURROGATE, pos);
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 3;
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            if (pos + 3 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            }
-
-            // range check
-            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-            if (code_point <= 0xffff) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0x10ffff < code_point) {
-                return result(error_code::TOO_LARGE, pos);
-            }
-            *utf32_output++ = char32_t(code_point);
-            pos += 4;
-        } else {
-            // we either have too many continuation bytes or an invalid leading byte
-            if ((leading_byte & 0b11000000) == 0b10000000) {
-                return result(error_code::TOO_LONG, pos);
-            } else {
-                return result(error_code::HEADER_BITS, pos);
-            }
-        }
+inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *utf32_output++ = char32_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf32_output++ = char32_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8
+      if(pos + 1 >= len) { return 0; } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
+      *utf32_output++ = char32_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      if(pos + 2 >= len) { return 0; } // minimal bound checking
+
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return 0;
+      }
+      *utf32_output++ = char32_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if(pos + 3 >= len) { return 0; } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
+
+      // range check
+      uint32_t code_point =
+          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
+      *utf32_output++ = char32_t(code_point);
+      pos += 4;
+    } else {
+      return 0;
+    }
+  }
+  return utf32_output - start;
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *utf32_output++ = char32_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf32_output++ = char32_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8
+      if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
+      *utf32_output++ = char32_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
+
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); }
+      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
+      *utf32_output++ = char32_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);}
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
+
+      // range check
+      uint32_t code_point =
+          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
+      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
+      *utf32_output++ = char32_t(code_point);
+      pos += 4;
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
+      else { return result(error_code::HEADER_BITS, pos); }
     }
-    return result(error_code::SUCCESS, utf32_output - start);
+  }
+  return result(error_code::SUCCESS, utf32_output - start);
 }
 
 /**
@@ -13210,44 +11992,41 @@ inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_o
  * If the error is believed to have occured prior to 'buf', the count value contain in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output)
-{
-    size_t extra_len { 0 };
-    // We potentially need to go back in time and find a leading byte.
-    size_t how_far_back = 3; // 3 bytes in the past + current position
-    if (how_far_back > prior_bytes) {
-        how_far_back = prior_bytes;
-    }
-    bool found_leading_bytes { false };
-    // important: it is i <= how_far_back and not 'i < how_far_back'.
-    for (size_t i = 0; i <= how_far_back; i++) {
-        unsigned char byte = buf[0 - i];
-        found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-        if (found_leading_bytes) {
-            buf -= i;
-            extra_len = i;
-            break;
-        }
-    }
-    //
-    // It is possible for this function to return a negative count in its result.
-    // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-    // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-    //
-    // An unsigned type will simply wrap round arithmetically (well defined).
-    //
-    if (!found_leading_bytes) {
-        // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-        // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-        // Or we possibly have a stream that does not start with a leading byte.
-        return result(error_code::TOO_LONG, 0 - how_far_back);
-    }
-
-    result res = convert_with_errors(buf, len + extra_len, utf32_output);
-    if (res.error) {
-        res.count -= extra_len;
-    }
-    return res;
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
+  size_t extra_len{0};
+  // We potentially need to go back in time and find a leading byte.
+  size_t how_far_back = 3; // 3 bytes in the past + current position
+  if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back'.
+  for(size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[0-i];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if(found_leading_bytes) {
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
+  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
+  //
+  // An unsigned type will simply wrap round arithmetically (well defined).
+  //
+  if(!found_leading_bytes) {
+    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
+    // Or we possibly have a stream that does not start with a leading byte.
+    return result(error_code::TOO_LONG, 0-how_far_back);
+  }
+
+  result res = convert_with_errors(buf, len + extra_len, utf32_output);
+  if (res.error) {
+    res.count -= extra_len;
+  }
+  return res;
 }
 
 } // utf8_to_utf32 namespace
@@ -13268,42 +12047,41 @@ namespace scalar {
 namespace {
 namespace latin1_to_utf8 {
 
-inline size_t convert(const char* buf, size_t len, char* utf8_output)
-{
-    const unsigned char* data = reinterpret_cast<const unsigned char*>(buf);
-    size_t pos = 0;
-    char* start { utf8_output };
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
-            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *utf8_output++ = char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-
-        unsigned char byte = data[pos];
-        if ((byte & 0x80) == 0) { // if ASCII
-            // will generate one UTF-8 bytes
-            *utf8_output++ = char(byte);
-            pos++;
-        } else {
-            // will generate two UTF-8 bytes
-            *utf8_output++ = char((byte >> 6) | 0b11000000);
-            *utf8_output++ = char((byte & 0b111111) | 0b10000000);
-            pos++;
-        }
+inline size_t convert(const char* buf, size_t len, char* utf8_output) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *utf8_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    unsigned char byte = data[pos];
+    if((byte & 0x80) == 0) { // if ASCII
+      // will generate one UTF-8 bytes
+      *utf8_output++ = char(byte);
+      pos++;
+    } else {
+      // will generate two UTF-8 bytes
+      *utf8_output++ = char((byte>>6) | 0b11000000);
+      *utf8_output++ = char((byte & 0b111111) | 0b10000000);
+      pos++;
     }
-    return utf8_output - start;
+  }
+  return utf8_output - start;
 }
 
 } // latin1_to_utf8 namespace
@@ -13323,36 +12101,34 @@ namespace scalar {
 namespace {
 namespace latin1_to_utf16 {
 
-template<endianness big_endian>
-inline size_t convert(const char* buf, size_t len, char16_t* utf16_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
+template <endianness big_endian>
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+  size_t pos = 0;
+  char16_t* start{ utf16_output };
 
-    while (pos < len) {
-        uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
-        *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
-        pos++;
-    }
+  while (pos < len) {
+    uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
+    *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
+    pos++;
+  }
 
-    return utf16_output - start;
+  return utf16_output - start;
 }
 
-template<endianness big_endian>
-inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char16_t* start { utf16_output };
+template <endianness big_endian>
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+  size_t pos = 0;
+  char16_t* start{ utf16_output };
 
-    while (pos < len) {
-        uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
-        *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
-        pos++;
-    }
+  while (pos < len) {
+    uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
+    *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
+    pos++;
+  }
 
-    return result(error_code::SUCCESS, utf16_output - start);
+  return result(error_code::SUCCESS, utf16_output - start);
 }
 
 } // latin1_to_utf16 namespace
@@ -13372,24 +12148,23 @@ namespace scalar {
 namespace {
 namespace latin1_to_utf32 {
 
-inline size_t convert(const char* buf, size_t len, char32_t* utf32_output)
-{
-    const unsigned char* data = reinterpret_cast<const unsigned char*>(buf);
-    char32_t* start { utf32_output };
-    for (size_t i = 0; i < len; i++) {
-        *utf32_output++ = (char32_t)data[i];
-    }
-    return utf32_output - start;
+
+inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  char32_t* start{utf32_output};
+  for (size_t i = 0; i < len; i++) {
+    *utf32_output++ = (char32_t)data[i];
+  }
+  return utf32_output - start;
 }
 
-inline result convert_with_errors(const char32_t* buf, size_t len, char32_t* utf32_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    char32_t* start { utf32_output };
-    for (size_t i = 0; i < len; i++) {
-        *utf32_output++ = (char32_t)data[i];
-    }
-    return result(error_code::SUCCESS, utf32_output - start);
+inline result convert_with_errors(const char32_t *buf, size_t len, char32_t *utf32_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  char32_t* start{utf32_output};
+  for (size_t i = 0; i < len; i++) {
+    *utf32_output++ = (char32_t)data[i];
+  }
+  return result(error_code::SUCCESS, utf32_output - start);
 }
 
 } // latin1_to_utf32 namespace
@@ -13411,121 +12186,113 @@ namespace scalar {
 namespace {
 namespace utf8_to_latin1 {
 
-inline size_t convert(const char* buf, size_t len, char* latin_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char* start { latin_output };
-
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000 .... etc
-            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *latin_output++ = char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-
-        // suppose it is not an all ASCII byte sequence
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *latin_output++ = char(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                return 0;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
-            // range check -
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
-            if (0xFF < code_point) {
-                return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters
-            }
-            *latin_output++ = char(code_point);
-            pos += 2;
-        } else {
-            return 0;
-        }
-    }
-    return latin_output - start;
-}
+inline size_t convert(const char* buf, size_t len, char* latin_output) {
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  char* start{latin_output};
+
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000 .... etc
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *latin_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    // suppose it is not an all ASCII byte sequence
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *latin_output++ = char(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
+      // We have a two-byte UTF-8
+      if(pos + 1 >= len) {
+         return 0; } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
+      // range check -
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
+      if (code_point < 0x80 || 0xFF < code_point) {
+        return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters. A code_point beneath 0x80 is invalid as it's already covered by bytes whose leading bit is zero. 
+      }
+      *latin_output++ = char(code_point);
+      pos += 2;
+    } else {
+      return 0;
+    }
+  }
+  return latin_output - start;
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char* latin_output) { // UTF-8 -> Latin-1 with error reporting: returns (error_code, input offset of the offending byte) on failure, (SUCCESS, number of bytes written) otherwise
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf); // unsigned view of the input used for all bit tests below
+  size_t pos = 0; // index of the next unread input byte
+  char* start{latin_output}; // kept so the number of bytes written can be computed at the end
+
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000...etc
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) { // copy the 16 ASCII bytes unchanged
+          *latin_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    // suppose it is not an all ASCII byte sequence
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *latin_output++ = char(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
+      // We have a two-byte UTF-8
+      if(pos + 1 >= len) {
+        return result(error_code::TOO_SHORT, pos); } // minimal bound checking: the lead byte is the last input byte, so its continuation byte is missing
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
+      // range check -
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
+      if (code_point < 0x80) {
+        return result(error_code::OVERLONG, pos); } // an ASCII value was encoded with two bytes
+      if ( 0xFF < code_point) {
+          return result(error_code::TOO_LARGE, pos);
+          } // We only care about the range 128-255 (0x80-0xFF) which is Non-ASCII latin1 characters
+      *latin_output++ = char(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8
+      return result(error_code::TOO_LARGE, pos); // reported without inspecting the continuation bytes
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      return result(error_code::TOO_LARGE, pos); // reported without inspecting the continuation bytes
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos); } // 10xxxxxx: a continuation byte where a leading byte was expected
 
-inline result convert_with_errors(const char* buf, size_t len, char* latin_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-    size_t pos = 0;
-    char* start { latin_output };
-
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000...etc
-            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *latin_output++ = char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        // suppose it is not an all ASCII byte sequence
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *latin_output++ = char(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                return result(error_code::TOO_SHORT, pos);
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return result(error_code::TOO_SHORT, pos);
-            } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
-            // range check -
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
-            if (code_point < 0x80) {
-                return result(error_code::OVERLONG, pos);
-            }
-            if (0xFF < code_point) {
-                return result(error_code::TOO_LARGE, pos);
-            } // We only care about the range 129-255 which is Non-ASCII latin1 characters
-            *latin_output++ = char(code_point);
-            pos += 2;
-        } else if ((leading_byte & 0b11110000) == 0b11100000) {
-            // We have a three-byte UTF-8
-            return result(error_code::TOO_LARGE, pos);
-        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-            // we have a 4-byte UTF-8 word.
-            return result(error_code::TOO_LARGE, pos);
-        } else {
-            // we either have too many continuation bytes or an invalid leading byte
-            if ((leading_byte & 0b11000000) == 0b10000000) {
-                return result(error_code::TOO_LONG, pos);
-            }
+      return result(error_code::HEADER_BITS, pos); // remaining case 11111xxx: not a valid UTF-8 leading byte
 
-            return result(error_code::HEADER_BITS, pos);
-        }
     }
-    return result(error_code::SUCCESS, latin_output - start);
+  }
+  return result(error_code::SUCCESS, latin_output - start); // whole input consumed: report how many Latin-1 bytes were produced
 }
 
 } // utf8_to_latin1 namespace
@@ -13545,82 +12312,69 @@ namespace scalar {
 namespace {
 namespace utf16_to_latin1 {
 
-#include <cstring> // for std::memcpy
-
-template<endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char* latin_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    std::vector<char> temp_output(len);
-    char* current_write = temp_output.data();
-    uint16_t word = 0;
-    uint16_t too_large = 0;
-
-    while (pos < len) {
-        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        too_large |= word;
-        *current_write++ = char(word & 0xFF);
+#include <cstring>  // for std::memcpy
+
+template <endianness big_endian>
+inline size_t convert(const char16_t* buf, size_t len, char* latin_output) { // UTF-16 -> Latin-1: returns the number of bytes written (one per code unit), or 0 if any code unit is above 0xFF
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  std::vector<char> temp_output(len); // staging buffer so latin_output stays untouched when the input is rejected
+  char* current_write = temp_output.data();
+  uint16_t word = 0;
+  uint16_t too_large = 0; // OR of every code unit seen; any bit in the high byte means some value exceeded 0xFF
+
+  while (pos < len) {
+    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; // byte-swap unless match_system(big_endian) holds (presumably: input endianness equals the host's)
+    too_large |= word;
+    *current_write++ = char(word & 0xFF); // written unconditionally; validity is decided once after the loop
+    pos++;
+  }
+  if((too_large & 0xFF00) != 0) { return 0; }
+
+  // Only copy to latin_output if there were no errors
+  std::memcpy(latin_output, temp_output.data(), len);
+  
+  return current_write - temp_output.data();
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output) { // UTF-16 -> Latin-1 with error reporting: returns (TOO_LARGE, index of the first code unit above 0xFF) or (SUCCESS, number of bytes written)
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next unread code unit
+  char* start{latin_output}; // kept so the number of bytes written can be computed at the end
+  uint16_t word;
+
+  while (pos < len) {
+    if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that they are Latin1
+      uint64_t v1, v2, v3, v4; // four 64-bit loads, each holding four consecutive 16-bit code units
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
+      ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
+      ::memcpy(&v4, data + pos  + 12, sizeof(uint64_t));
+
+      if (!match_system(big_endian)) { v1 = (v1 >> 8) | (v1 << (64 - 8)); } // rotate right by 8 so that, for byte-swapped input, the logical high bytes land on the 0xFF00 lanes tested below
+      if (!match_system(big_endian)) { v2 = (v2 >> 8) | (v2 << (64 - 8)); }
+      if (!match_system(big_endian)) { v3 = (v3 >> 8) | (v3 << (64 - 8)); }
+      if (!match_system(big_endian)) { v4 = (v4 >> 8) | (v4 << (64 - 8)); } // must rotate v4 itself: mixing in v1 here left code units 13..15 of the block unchecked
+
+      if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { // every high byte is zero, so all 16 code units fit in Latin-1
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; // slow path: one code unit at a time, which also pinpoints the failing index
+    if((word & 0xFF00 ) == 0) {
+        *latin_output++ = char(word & 0xFF);
         pos++;
-    }
-    if ((too_large & 0xFF00) != 0) {
-        return 0;
-    }
-
-    // Only copy to latin_output if there were no errors
-    std::memcpy(latin_output, temp_output.data(), len);
-
-    return current_write - temp_output.data();
+    } else { return result(error_code::TOO_LARGE, pos); } // code unit above 0xFF has no Latin-1 representation
+  }
+  return result(error_code::SUCCESS,latin_output - start);
 }
 
-template<endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char* start { latin_output };
-    uint16_t word;
-
-    while (pos < len) {
-        if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that they are Latin1
-            uint64_t v1, v2, v3, v4;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
-            ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
-            ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));
-
-            if (!match_system(big_endian)) {
-                v1 = (v1 >> 8) | (v1 << (64 - 8));
-            }
-            if (!match_system(big_endian)) {
-                v2 = (v2 >> 8) | (v2 << (64 - 8));
-            }
-            if (!match_system(big_endian)) {
-                v3 = (v3 >> 8) | (v3 << (64 - 8));
-            }
-            if (!match_system(big_endian)) {
-                v4 = (v1 >> 8) | (v4 << (64 - 8));
-            }
-
-            if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        if ((word & 0xFF00) == 0) {
-            *latin_output++ = char(word & 0xFF);
-            pos++;
-        } else {
-            return result(error_code::TOO_LARGE, pos);
-        }
-    }
-    return result(error_code::SUCCESS, latin_output - start);
-}
 
 } // utf16_to_latin1 namespace
 } // unnamed namespace
@@ -13639,51 +12393,45 @@ namespace scalar {
 namespace {
 namespace utf32_to_latin1 {
 
-inline size_t convert(const char32_t* buf, size_t len, char* latin1_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    char* start = latin1_output;
-    uint32_t utf32_char;
-    size_t pos = 0;
-    uint32_t too_large = 0;
-
-    while (pos < len) {
-        utf32_char = (uint32_t)data[pos];
-        too_large |= utf32_char;
-        *latin1_output++ = (char)(utf32_char & 0xFF);
-        pos++;
-    }
-    if ((too_large & 0xFFFFFF00) != 0) {
-        return 0;
-    }
-    return latin1_output - start;
-}
-
-inline result convert_with_errors(const char32_t* buf, size_t len, char* latin1_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    char* start { latin1_output };
-    size_t pos = 0;
-    while (pos < len) {
-        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0xFFFFFF00FFFFFF00) == 0) {
-                *latin1_output++ = char(buf[pos]);
-                *latin1_output++ = char(buf[pos + 1]);
-                pos += 2;
-                continue;
-            }
-        }
-        uint32_t utf32_char = data[pos];
-        if ((utf32_char & 0xFFFFFF00) == 0) { // Check if the character can be represented in Latin-1
-            *latin1_output++ = (char)(utf32_char & 0xFF);
-            pos++;
-        } else {
-            return result(error_code::TOO_LARGE, pos);
-        };
-    }
-    return result(error_code::SUCCESS, latin1_output - start);
+inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { // UTF-32 -> Latin-1: returns the number of bytes written, or 0 if any input value exceeds 0xFF
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  char* start = latin1_output;
+  uint32_t utf32_char;
+  size_t pos = 0;
+  uint32_t too_large = 0; // OR of every value seen; any bit above the low byte means some value exceeded 0xFF
+
+  while (pos < len) {
+    utf32_char = (uint32_t)data[pos];
+    too_large |= utf32_char;
+    *latin1_output++ = (char)(utf32_char & 0xFF); // written directly to the caller's buffer before validity is known
+    pos++;
+  }
+  if((too_large & 0xFFFFFF00) != 0) { return 0; } // note: latin1_output has already been overwritten at this point
+  return latin1_output - start;
+}
+
+inline result convert_with_errors(const char32_t *buf, size_t len, char *latin1_output) { // UTF-32 -> Latin-1 with error reporting: returns (TOO_LARGE, index of the first value above 0xFF) or (SUCCESS, number of bytes written)
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  char* start{latin1_output}; // kept so the number of bytes written can be computed at the end
+  size_t pos = 0;
+  while (pos < len) {
+    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
+      uint64_t v; // holds two consecutive 32-bit values
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF00FFFFFF00) == 0) { // upper 24 bits of both values are zero
+        *latin1_output++ = char(buf[pos]);
+        *latin1_output++ = char(buf[pos+1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t utf32_char = data[pos]; // slow path: one value at a time, which also pinpoints the failing index
+    if ((utf32_char & 0xFFFFFF00) == 0) { // Check if the character can be represented in Latin-1
+      *latin1_output++ = (char)(utf32_char & 0xFF);
+      pos++;
+    } else { return result(error_code::TOO_LARGE, pos); };
+  }
+  return result(error_code::SUCCESS, latin1_output - start);
 }
 
 } // utf32_to_latin1 namespace
@@ -13704,55 +12452,50 @@ namespace scalar {
 namespace {
 namespace utf8_to_latin1 {
 
-inline size_t convert_valid(const char* buf, size_t len, char* latin_output)
-{
-    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
-
-    size_t pos = 0;
-    char* start { latin_output };
-
-    while (pos < len) {
-        // try to convert the next block of 16 ASCII bytes
-        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-            uint64_t v1;
-            ::memcpy(&v1, data + pos, sizeof(uint64_t));
-            uint64_t v2;
-            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
-            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
-                size_t final_pos = pos + 16;
-                while (pos < final_pos) {
-                    *latin_output++ = char(buf[pos]);
-                    pos++;
-                }
-                continue;
-            }
-        }
-
-        // suppose it is not an all ASCII byte sequence
-        uint8_t leading_byte = data[pos]; // leading byte
-        if (leading_byte < 0b10000000) {
-            // converting one ASCII byte !!!
-            *latin_output++ = char(leading_byte);
-            pos++;
-        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
-            // We have a two-byte UTF-8
-            if (pos + 1 >= len) {
-                break;
-            } // minimal bound checking
-            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-                return 0;
-            } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
-            // range check -
-            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
-            *latin_output++ = char(code_point);
-            pos += 2;
-        } else {
-            // we may have a continuation but we do not do error checking
-            return 0;
-        }
+inline size_t convert_valid(const char* buf, size_t len, char* latin_output) { // UTF-8 -> Latin-1 with only minimal checking: returns the number of bytes written, or 0 when a lead byte is neither ASCII nor a two-byte lead, or a two-byte lead is not followed by a continuation byte
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+
+  size_t pos = 0; // index of the next unread input byte
+  char* start{latin_output}; // kept so the number of bytes written can be computed at the end
+
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) { // copy the 16 ASCII bytes unchanged
+          *latin_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    // suppose it is not an all ASCII byte sequence
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *latin_output++ = char(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
+      // We have a two-byte UTF-8
+      if(pos + 1 >= len) { break; } // minimal bound checking: a truncated trailing sequence ends the loop and the bytes written so far are returned
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
+      // range check -
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
+      *latin_output++ = char(code_point); // no range check here: the value is narrowed to char as-is
+      pos += 2;
+    } else {
+      // we may have a continuation but we do not do error checking
+      return 0;
     }
-    return latin_output - start;
+  }
+  return latin_output - start;
 }
 
 } // utf8_to_latin1 namespace
@@ -13772,21 +12515,20 @@ namespace scalar {
 namespace {
 namespace utf16_to_latin1 {
 
-template<endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output)
-{
-    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
-    size_t pos = 0;
-    char* start { latin_output };
-    uint16_t word = 0;
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output) { // UTF-16 -> Latin-1 without validation: each code unit is narrowed to char; returns the number of bytes written (one per code unit)
+ const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  char* start{latin_output}; // kept so the number of bytes written can be computed at the end
+  uint16_t word = 0;
 
-    while (pos < len) {
-        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-        *latin_output++ = char(word);
-        pos++;
-    }
+  while (pos < len) {
+    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; // byte-swap unless match_system(big_endian) holds (presumably: input endianness equals the host's)
+    *latin_output++ = char(word); // no range check: the code unit is narrowed to char as-is
+    pos++;
+  }
 
-    return latin_output - start;
+  return latin_output - start;
 }
 
 } // utf16_to_latin1 namespace
@@ -13806,32 +12548,33 @@ namespace scalar {
 namespace {
 namespace utf32_to_latin1 {
 
-inline size_t convert_valid(const char32_t* buf, size_t len, char* latin1_output)
-{
-    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
-    char* start = latin1_output;
-    uint32_t utf32_char;
-    size_t pos = 0;
+inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output) { // UTF-32 -> Latin-1 without validation: every value is reduced to its low byte; returns the number of bytes written (one per input value)
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  char* start = latin1_output; // kept so the number of bytes written can be computed at the end
+  uint32_t utf32_char;
+  size_t pos = 0;
 
-    while (pos < len) {
-        utf32_char = (uint32_t)data[pos];
-
-        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
-            uint64_t v;
-            ::memcpy(&v, data + pos, sizeof(uint64_t));
-            if ((v & 0xFFFFFF00FFFFFF00) == 0) {
-                *latin1_output++ = char(buf[pos]);
-                *latin1_output++ = char(buf[pos + 1]);
-                pos += 2;
-                continue;
-            }
-        }
-        *latin1_output++ = (char)(utf32_char & 0xFF);
-        pos++;
+  while (pos < len) {
+  utf32_char = (uint32_t)data[pos]; // read up front; only consumed by the single-value path at the bottom of the loop
+
+  if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
+      uint64_t v; // holds two consecutive 32-bit values
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF00FFFFFF00) == 0) { // upper 24 bits of both values are zero
+      *latin1_output++ = char(buf[pos]);
+      *latin1_output++ = char(buf[pos+1]);
+      pos += 2;
+      continue;
     }
-    return latin1_output - start;
+  }
+  *latin1_output++ = (char)(utf32_char & 0xFF); // no range check: only the low byte is kept
+  pos++;
+
+  }
+  return latin1_output - start;
 }
 
+
 } // utf32_to_latin1 namespace
 } // unnamed namespace
 } // namespace scalar
@@ -13840,9 +12583,12 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char* latin1_output
 #endif
 /* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
 
+
+
 SIMDUTF_PUSH_DISABLE_WARNINGS
 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
+
 #if SIMDUTF_IMPLEMENTATION_ARM64
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/implementation.cpp
 /* begin file src/arm64/implementation.cpp */
@@ -13859,16 +12605,14 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
-{
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) { // true when the OR-reduction of input has no byte >= 0x80, i.e. no input byte has its high bit set
     simd8<uint8_t> bits = input.reduce_or();
     return bits.max_val() < 0b10000000u;
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
-    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
+    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
@@ -13878,19 +12622,91 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) { // per-lane mask: XOR of (prev2 >= 0b11100000) and (prev3 >= 0b11110000); presumably prev2/prev3 are the input shifted back by 2 and 3 bytes - confirm at the call site
+    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u); // lanes whose prev2 byte is a 3- or 4-byte lead (1110xxxx or above)
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     return is_third_byte ^ is_fourth_byte;
 }
 
+// common functions for utf8 conversions
+simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { // builds four 16-bit code units from four 3-byte UTF-8 sequences held in bytes 0..11 of in
+  // Low half contains  10cccccc|1110aaaa
+  // High half contains 10bbbbbb|10bbbbbb
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t sh = make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10);
+#else
+  const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; // low half: (first, third) byte of each sequence; high half: the middle byte duplicated
+#endif
+  uint8x16_t perm = vqtbl1q_u8(in, sh);
+  // Split into half vectors.
+  // 10cccccc|1110aaaa
+  uint8x8_t perm_low = vget_low_u8(perm); // no-op
+  // 10bbbbbb|10bbbbbb
+  uint8x8_t perm_high = vget_high_u8(perm);
+  // xxxxxxxx 10bbbbbb
+  uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
+  // xxxxxxxx 1110aaaa
+  uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
+  // Assemble with shift left insert.
+  // xxxxxxaa aabbbbbb
+  uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
+  // (perm_low << 8) | (perm_low >> 8)
+  // xxxxxxxx 10cccccc
+  uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
+  // Shift left insert into the low bits
+  // aaaabbbb bbcccccc
+  uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
+  return composed;
+}
+
+simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { // treats in as eight 2-byte UTF-8 sequences and returns the eight decoded 11-bit values as 16-bit lanes
+  // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters.
+  // Technically this calculates 8, but 6 does better and happens more often
+  // (The languages which use these codepoints use ASCII spaces so 8 would need to be
+  // in the middle of a very long word).
+
+  // 10bbbbbb 110aaaaa
+  uint16x8_t upper = vreinterpretq_u16_u8(in);
+  // (in << 8) | (in >> 8)
+  // 110aaaaa 10bbbbbb
+  uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
+  // 00000000 000aaaaa
+  uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
+  // Assemble with shift left insert.
+  // 00000aaa aabbbbbb
+  uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); // keeps the low 6 bits of lower and inserts upper_masked << 6 above them
+  return composed;
+}
+
+simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { // decodes a mix of 1- and 2-byte UTF-8 sequences from in into 16-bit lanes, using a table-driven byte shuffle
+  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
+  // This is a relatively easy scenario
+  // we process SIX (6) input code-words. The max length in bytes of six code
+  // words spanning between 1 and 2 bytes each is 12 bytes.
+  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); // shufutf8_idx selects the table row used as the shuffle mask
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
+  // Mask
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000000 00bbbbbb
+  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
+  // 1 byte: 00000000 00000000
+  // 2 byte: 000aaaaa 00000000
+  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
+  // Combine with a shift right accumulate
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000aaa aabbbbbb
+  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); // ascii + (highbyte >> 2); the two bit ranges do not overlap
+  return composed;
+}
+
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_detect_encodings.cpp
 /* begin file src/arm64/arm_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int arm_detect_encodings(const char* buf, size_t len)
-{
+int arm_detect_encodings(const char * buf, size_t len) {
     const char* start = buf;
     const char* end = buf + len;
 
@@ -13905,13 +12721,13 @@ int arm_detect_encodings(const char* buf, size_t len)
 
     uint32x4_t currentmax = vmovq_n_u32(0x0);
 
-    checker check {};
+    checker check{};
 
-    while (buf + 64 <= end) {
+    while(buf + 64 <= end) {
         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2 * simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3 * simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -13943,15 +12759,15 @@ int arm_detect_encodings(const char* buf, size_t len)
                 is_utf32 = false;
                 // Code from arm_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_wordmask1
-                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
+                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint64_t V0 = ~surrogates_wordmask0;
 
-                const auto vH0 = ((in16 & v_fc) == v_dc);
+                const auto vH0 = ((in16 & v_fc) ==  v_dc);
                 const uint64_t H0 = vH0.to_bitmask64();
 
                 const uint64_t L0 = ~H0 & surrogates_wordmask0;
@@ -13978,12 +12794,12 @@ int arm_detect_encodings(const char* buf, size_t len)
                     const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
 
                     const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
-                    if (surrogates_wordmask == 0) {
+                    if(surrogates_wordmask == 0) {
                         input += 16;
                     } else {
                         const uint64_t V = ~surrogates_wordmask;
 
-                        const auto vH = ((in_16 & v_fc) == v_dc);
+                        const auto vH = ((in_16 & v_fc) ==  v_dc);
                         const uint64_t H = vH.to_bitmask64();
 
                         const uint64_t L = ~H & surrogates_wordmask;
@@ -14007,23 +12823,23 @@ int arm_detect_encodings(const char* buf, size_t len)
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
+                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
 
                     // Must start checking for surrogates
                     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
                     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
                     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
 
-                    const uint32x4_t in32 = vreinterpretq_u32_u16(in);
-                    const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin);
-                    const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin);
-                    const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin);
+                    const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
+                    const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
+                    const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
+                    const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
 
-                    currentmax = vmaxq_u32(in32, currentmax);
-                    currentmax = vmaxq_u32(secondin32, currentmax);
-                    currentmax = vmaxq_u32(thirdin32, currentmax);
-                    currentmax = vmaxq_u32(fourthin32, currentmax);
+                    currentmax = vmaxq_u32(in32,currentmax);
+                    currentmax = vmaxq_u32(secondin32,currentmax);
+                    currentmax = vmaxq_u32(thirdin32,currentmax);
+                    currentmax = vmaxq_u32(fourthin32,currentmax);
 
                     currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
                     currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
@@ -14032,13 +12848,13 @@ int arm_detect_encodings(const char* buf, size_t len)
 
                     while (input + 4 < end32) {
                         const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-                        currentmax = vmaxq_u32(in_32, currentmax);
+                        currentmax = vmaxq_u32(in_32,currentmax);
                         currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if (vmaxvq_u32(forbidden_words) != 0) {
+                    if(vmaxvq_u32(forbidden_words) != 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -14050,10 +12866,10 @@ int arm_detect_encodings(const char* buf, size_t len)
         // If no surrogate, validate under other encodings as well
 
         // UTF-32 validation
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in), currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin), currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin), currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
 
         // UTF-8 validation
         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
@@ -14067,7 +12883,7 @@ int arm_detect_encodings(const char* buf, size_t len)
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64] {};
+            uint8_t block[64]{};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -14078,14 +12894,14 @@ int arm_detect_encodings(const char* buf, size_t len)
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
+        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -14096,9 +12912,8 @@ int arm_detect_encodings(const char* buf, size_t len)
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf16.cpp
 /* begin file src/arm64/arm_validate_utf16.cpp */
-template<endianness big_endian>
-const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
-{
+template <endianness big_endian>
+const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
     const char16_t* end = input + size;
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
@@ -14111,11 +12926,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
         auto in0 = simd16<uint16_t>(input);
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
         if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
+            #else
+            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+            #endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -14124,7 +12939,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if (surrogates_wordmask == 0) {
+        if(surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -14138,7 +12953,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) == v_dc);
+            const auto vH = ((in & v_fc) ==  v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -14146,11 +12961,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                                             // (A low surrogate placed in the 7th register's word
-                                             // is an exception we handle.)
+                              // (A low surrogate placed in the 7th register's word
+                              // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                                       // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b; // Combine all the masks into the final one.
+                          // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -14169,9 +12984,9 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
     return input;
 }
 
-template<endianness big_endian>
-const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
-{
+
+template <endianness big_endian>
+const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -14187,11 +13002,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
+            #else
+            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+            #endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -14200,7 +13015,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if (surrogates_wordmask == 0) {
+        if(surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -14214,7 +13029,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) == v_dc);
+            const auto vH = ((in & v_fc) ==  v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -14222,11 +13037,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                                             // (A low surrogate placed in the 7th register's word
-                                             // is an exception we handle.)
+                              // (A low surrogate placed in the 7th register's word
+                              // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                                       // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b; // Combine all the masks into the final one.
+                          // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -14248,8 +13063,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
 /* begin file src/arm64/arm_validate_utf32le.cpp */
 
-const char32_t* arm_validate_utf32le(const char32_t* input, size_t size)
-{
+const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
     const char32_t* end = input + size;
 
     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
@@ -14260,26 +13074,26 @@ const char32_t* arm_validate_utf32le(const char32_t* input, size_t size)
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in, currentmax);
+        currentmax = vmaxq_u32(in,currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
         input += 4;
     }
 
     uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-    if (vmaxvq_u32(is_zero) != 0) {
+    if(vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if (vmaxvq_u32(is_zero) != 0) {
+    if(vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     return input;
 }
 
-const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size)
-{
+
+const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -14291,16 +13105,16 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in, currentmax);
+        currentmax = vmaxq_u32(in,currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
 
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if (vmaxvq_u32(is_zero) != 0) {
+        if(vmaxvq_u32(is_zero) != 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if (vmaxvq_u32(is_zero) != 0) {
+        if(vmaxvq_u32(is_zero) != 0) {
             return result(error_code::SURROGATE, input - start);
         }
 
@@ -14313,180 +13127,264 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
-// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
-// It returns how many bytes were consumed (up to 12).
-template<endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char16_t*& utf16_output)
-{
-// we use an approach where we try to process up to 12 input bytes.
-// Why 12 input bytes and not 16? Because we are concerned with the size of
-// the lookup tables. Also 12 is nicely divisible by two and three.
-//
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-    const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-        // We process in chunks of 16 bytes
-        uint16x8_t ascii_first = vmovl_u8(vget_low_u8(in));
-        uint16x8_t ascii_second = vmovl_high_u8(in);
-        if (!match_system(big_endian)) {
-            ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
-            ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
-        }
-        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
-        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
-        utf16_output += 16; // We wrote 16 16-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-        uint8x16_t perm = vqtbl1q_u8(in, swap);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-        if (!match_system(big_endian)) {
-            composed = vqtbl1q_u8(composed, swap);
-        }
-        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-        utf16_output += 8; // We wrote 16 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
-#else
-        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
-#endif
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-        if (!match_system(big_endian)) {
-            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-        }
-        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-        utf16_output += 4;
-        return 12;
-    }
-    /// We do not have a fast path available, so we fallback.
-
-    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes.
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-        if (!match_system(big_endian)) {
-            composed = vqtbl1q_u8(composed, swap);
-        }
-        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-        utf16_output += 6; // We wrote 12 bytes, 6 code points.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-        if (!match_system(big_endian)) {
-            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-        }
-        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-        utf16_output += 4;
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        //////////////
-        // There might be garbage inputs where a leading byte mascarades as a four-byte
-        // leading byte (by being followed by 3 continuation byte), but is not greater than
-        // 0xf0. This could trigger a buffer overflow if we only counted leading
-        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
-        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
-        // We do as at the cost of an extra mask.
-        /////////////
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-        // correct for spurious high bit
-        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-        middlehighbyte = veorq_u8(correct, middlehighbyte);
-        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-        // We deliberately carry the leading four bits if they are present, we remove
-        // them later when computing hightenbits.
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000)));
-        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-        // When we need to generate a surrogate pair (leading byte > 0xF0), then
-        // the corresponding 32-bit value in 'composed'  will be greater than
-        // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
-        // location of the surrogate pairs.
-        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-        uint32x4_t composedminus = vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
-        uint32x4_t lowtenbits = vandq_u32(composedminus, vmovq_n_u32(0x3ff));
-        // Notice the 0x3ff mask:
-        uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff));
-        uint32x4_t lowtenbitsadd = vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
-        uint32x4_t hightenbitsadd = vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
-        uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
-        uint32x4_t surrogates = vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
-        uint32_t basic_buffer[4];
-        uint32_t basic_buffer_swap[4];
-        if (!match_system(big_endian)) {
-            vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
-            surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
-        }
-        vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
-        uint32_t surrogate_buffer[4];
-        vst1q_u32(surrogate_buffer, surrogates);
-        for (size_t i = 0; i < 3; i++) {
-            if (basic_buffer[i] > 0x3c00000) {
-                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-                utf16_output += 2;
-            } else {
-                utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-                utf16_output++;
-            }
-        }
-    } else {
-        // here we know that there is an error but we do not handle errors
+// It returns how many bytes were consumed (up to 16, usually 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
+    // We process in chunks of 16 bytes
+    // The routine in simd.h is reused.
+    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+    temp.store_ascii_as_utf16<big_endian>(utf16_output);
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+
+  // 3 byte sequences are the next most common, as seen in CJK, which has long sequences
+  // of these.
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+    }
+    vst1_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);
+    utf16_output += 4; // We wrote 4 16-bit characters.
+    return 12; // We consumed 12 bytes.
+  }
+
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
+    // We want to take 6 2-byte UTF-8 words and turn them into 6 2-byte UTF-16 words.
+    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+    }
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return 12; // We consumed 12 bytes.
+  }
+
+  /// We do not have a fast path available, or the fast path is unimportant, so we fall back.
+  const uint8_t idx =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // Convert to UTF-16
+    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+    }
+    // Store
+    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // XXX: depending on the system scalar instructions might be faster.
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: xx0bbbbb x0cccccc
+    // 3 byte: xxbbbbbb x0cccccc
+    uint16x4_t lowperm = vmovn_u32(perm);
+    // Partially mask with bic (doesn't require a temporary register unlike and)
+    // The shift left insert below will clear the top bits.
+    // 1 byte: 00000000 00000000
+    // 2 byte: xx0bbbbb 00000000
+    // 3 byte: xxbbbbbb 00000000
+    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
+    // ASCII
+    // 1 byte: 00000000 0ccccccc
+    // 2+byte: 00000000 00cccccc
+    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
+    // Split into narrow vectors.
+    // 2 byte: 00000000 00000000
+    // 3 byte: 00000000 xxxxaaaa
+    uint16x4_t highperm = vshrn_n_u32(perm, 16);
+    // Shift right accumulate the middle byte
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 00xx0bbb bbcccccc
+    // 3 byte: 00xxbbbb bbcccccc
+    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
+    // Shift left and insert the top 4 bits, overwriting the garbage
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 00000bbb bbcccccc
+    // 3 byte: aaaabbbb bbcccccc
+    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+    }
+    vst1_u16(reinterpret_cast<uint16_t*>(utf16_output), composed);
+
+    utf16_output += 4; // We wrote 4 16-bit codepoints
+    return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code-words
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 words and turn them into 3 4-byte UTF-16 pairs.
+      // Generating surrogate pairs is a little tricky though, but it is easier when we
+      // can assume they are all pairs.
+      // This version does not use the LUT, but 4 byte sequences are less common and the
+      // overhead of the extra memory access is less important than the early branch overhead
+      // in shorter sequences.
+
+      // Swap byte pairs
+      // 10dddddd 10cccccc|10bbbbbb 11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      uint8x16_t swap = vrev16q_u8(in);
+      // Shift left 2 bits
+      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
+      // Create a magic number containing the low 2 bits of the trail surrogate and all the
+      // corrections needed to create the pair.
+      // UTF-8 4b prefix   = -0x0000|0xF000
+      // surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
+      // surrogate high    = +0x0000|0xD800
+      // surrogate low     = +0xDC00|0x0000
+      // -------------------------------
+      //                   = +0xDC00|0xE7C0
+      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
+      // Generate unadjusted trail surrogate minus lowest 2 bits
+      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+      uint32x4_t trail = vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
+      // Insert low 2 bits of trail surrogate to magic number for later
+      // 11011100 00000000 11100111 110000cc
+      uint16x8_t magic_with_low_2 = vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
+      // Generate lead surrogate
+      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+      uint32x4_t lead = vreinterpretq_u32_u16(vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
+      // Mask out lead
+      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
+      // Blend pairs
+      // 000000cc ccdddddd|11110aaa bbbbbb00
+      uint16x8_t blend = vreinterpretq_u16_u32(vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
+      // Add magic number to finish the result
+      // 110111CC CCDDDDDD|110110AA BBBBBBCC
+      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
+      // Byte swap if necessary
+      if (!match_system(big_endian)) {
+        composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+      }
+      vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+      return 12; // We consumed 12 bytes.
+    }
+    // 3 1-4 byte sequences
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // Mask the low and middle bytes
+    // 00000000 00000000 00000000 0ddddddd
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
+    // Because the surrogates need more work, the high surrogate is computed first.
+    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
+    // 00000000 00000000 00cccccc 00000000
+    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
+    // Start assembling the sequence. Since the 4th byte is in the same position as it
+    // would be in a surrogate and there is no dependency, shift left instead of right.
+    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
+    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
+    // Top 16 bits contains the high ten bits of the surrogate pair before correction
+    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
+    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+    uint32x4_t abc = vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
+    // Combine the low 6 or 7 bits by a shift right accumulate
+    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o correction
+    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
+    // After this is for surrogates
+    // Blend the low and high surrogates
+    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
+    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits yet as
+    // 0x10000 was not subtracted from the codepoint yet.
+    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
+    uint16x8_t masked_pair =
+        vreinterpretq_u16_u32(vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
+    // Correct the remaining UTF-8 prefix, surrogate offset, and add the surrogate prefixes
+    // in one magic 16-bit addition.
+    // similar magic number but without the continue byte adjust and halfword swapped
+    // UTF-8 4b prefix   = -0xF000|0x0000
+    // surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
+    // surrogate high    = +0xD800|0x0000
+    // surrogate low     = +0x0000|0xDC00
+    // -----------------------------------
+    //                   = +0xE7C0|0xDC00
+    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+    uint32x4_t surrogates = vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
+    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));
+
+    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      selected = vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
+    }
+    // Attempting to shuffle and store would be complex, just scalarize.
+    uint32_t buffer[4];
+    vst1q_u32(buffer, selected);
+    // Test for the top bit of the surrogate mask.
+    const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000;
+    for (size_t i = 0; i < 3; i++) {
+      // Surrogate
+      if (buffer[i] & SURROGATE_MASK) {
+        utf16_output[0] = uint16_t(buffer[i] >> 16);
+        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
+        utf16_output += 2;
+      } else {
+        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
+        utf16_output++;
+      }
     }
     return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
+  }
 }
+
 /* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
@@ -14494,121 +13392,175 @@ size_t convert_masked_utf8_to_utf16(const char* input,
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char32_t*& utf32_out)
-{
-    // we use an approach where we try to process up to 12 input bytes.
-    // Why 12 input bytes and not 16? Because we are concerned with the size of
-    // the lookup tables. Also 12 is nicely divisible by two and three.
-    //
-    uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
-    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xFFF;
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-        // We process in chunks of 16 bytes
-        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(in)))));
-        vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8(in))));
-        vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
-        vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
-        utf32_output += 16; // We wrote 16 16-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-        // const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-        const uint8x16_t sh = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-        utf32_output += 8; // We wrote 32 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
-#else
-        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
-#endif
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-        vst1q_u32(utf32_output, composed);
-        utf32_output += 4;
-        return 12;
-    }
-    /// We do not have a fast path available, so we fallback.
-
-    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes.
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-        utf32_output += 6; // We wrote 12 bytes, 6 code points.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-        vst1q_u32(utf32_output, composed);
-        utf32_output += 4;
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-        uint8x16_t perm = vqtbl1q_u8(in, sh);
-        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-        // correct for spurious high bit
-        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-        middlehighbyte = veorq_u8(correct, middlehighbyte);
-        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
-        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-        vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
-        utf32_output += 3;
-    } else {
-        // here we know that there is an error but we do not handle errors
-    }
+size_t convert_masked_utf8_to_utf32(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char32_t *&utf32_out) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xFFF;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  //
+  // We first try a few fast paths.
+  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+    // We process in chunks of 16 bytes.
+    // use fast implementation in src/simdutf/arm64/simd.h
+    // Ideally the compiler can keep the tables in registers.
+    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+    temp.store_ascii_as_utf32_tbl(utf32_out);
+    utf32_output += 16; // We wrote 16 32-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+    // Convert to UTF-16
+    uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+    // Zero extend and store via ST2 with a zero.
+    uint16x4x2_t interleaver = {{ composed_utf16, vmov_n_u16(0) }};
+    vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return 12; // We consumed 12 bytes.
+  }
+
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if(input_utf8_end_of_code_point_mask == 0xaaa) {
+    // We want to take 6 2-byte UTF-8 words and turn them into 6 4-byte UTF-32 words.
+    // Convert to UTF-16
+    uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+    // Zero extend and store via ST2 with a zero.
+    uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }};
+    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return 12; // We consumed 12 bytes.
+  }
+  /// Either no fast path or an unimportant fast path.
+
+  const uint8_t idx =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // Convert to UTF-16
+    uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Zero extend and store with ST2 and zero
+    uint16x8x2_t interleaver = {{ composed_utf16, vmovq_n_u16(0) }};
+    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // Shuffle
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // Split
+    // 00000000 00000000 0ccccccc
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));    // 6 or 7 bits
+    // Note: unmasked
+    // xxxxxxxx aaaaxxxx xxxxxxxx
+    uint32x4_t high = vshrq_n_u32(perm, 4);                   // 4 bits
+    // Use 16 bit bic instead of and.
+    // The top bits will be corrected later in the bsl
+    // 00000000 10bbbbbb 00000000
+    uint32x4_t middle =
+        vreinterpretq_u32_u16(vbicq_u16(vreinterpretq_u16_u32(perm), vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
+    // Combine low and middle with shift right accumulate
+    // 00000000 00xxbbbb bbcccccc
+    uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
+    // Insert top 4 bits from high byte with bitwise select
+    // 00000000 aaaabbbb bbcccccc
+    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
+    vst1q_u32(utf32_output, composed);
+    utf32_output += 4; // We wrote 4 32-bit characters.
     return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code-words
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 words and turn them into 3 4-byte UTF-32 words.
+      // This uses the same method as the fixed 3 byte version, reversing and shift left insert.
+      // However, there is no need for a shuffle mask now, just rev16 and rev32.
+      //
+      // This version does not use the LUT, but 4 byte sequences are less common and the
+      // overhead of the extra memory access is less important than the early branch overhead
+      // in shorter sequences, so it comes last.
+
+      // Swap pairs of bytes
+      // 10dddddd|10cccccc|10bbbbbb|11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
+      // Shift left and insert
+      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+      uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
+      // Swap 16-bit lanes
+      // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
+      // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
+      uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
+      // Shift insert again
+      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+      uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
+      // Clear the garbage
+      // 00000000 000aaabb bbbbcccc ccdddddd
+      uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
+      // Store
+      vst1q_u32(utf32_output, composed);
+
+      utf32_output += 3; // We wrote 3 32-bit characters.
+      return 12; // We consumed 12 bytes.
+    }
+    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit due to
+    // surrogates no longer being involved.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // Ascii
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
+    uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
+    // When converting the way we do, the 3 byte prefix will be interpreted as the
+    // 18th bit being set, since the code would interpret the lead byte (0b1110bbbb)
+    // as a continuation byte (0b10bbbbbb). To fix this, we can either xor or do an
+    // 8 bit add of the 6th bit shifted right by 1. Since NEON has shift right accumulate,
+    // we use that.
+    //  4 byte   3 byte
+    // 10bbbbbb 1110bbbb
+    // 00000000 01000000 6th bit
+    // 00000000 00100000 shift right
+    // 10bbbbbb 0000bbbb add
+    // 00bbbbbb 0000bbbb mask
+    uint8x16_t correction =
+        vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
+    uint32x4_t corrected =
+        vreinterpretq_u32_u8(vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
+    // 00000000 00000000 0000cccc ccdddddd
+    uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
+    // Insert twice
+    // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
+    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), vshrq_n_u32(corrected, 4));
+    // 00000000 000aaabb bbbbcccc ccdddddd
+    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
+    // Store
+    vst1q_u32(utf32_output, composed);
+    utf32_output += 3; // We wrote 3 32-bit characters.
+    return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
+  }
 }
 /* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
 
@@ -14666,537 +13618,530 @@ size_t convert_masked_utf8_to_utf32(const char* input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out)
-{
-    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-    const char16_t* end = buf + len;
-
-    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-    while (buf + 16 <= end) {
-        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+template <endianness big_endian>
+std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
+  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+  const char16_t* end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  while (buf + 16 <= end) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      #else
+      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+      #endif
+      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+    }
+    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+        // It is common enough that we have sequences of 16 consecutive ASCII characters.
+        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
         if (!match_system(big_endian)) {
+          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+          #else
+          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+          #endif
+          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+        }
+        if(vmaxvq_u16(nextin) > 0x7F) {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x8_t utf8_packed = vmovn_u16(in);
+          // 2. store (8 bytes)
+          vst1_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          in = nextin;
+        } else {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+          // 2. store (16 bytes)
+          vst1q_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 16;
+          utf8_output += 16;
+          continue; // we are done for this round!
+        }
+    }
+
+    if (vmaxvq_u16(in) <= 0x7FF) {
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+          // expected output   : [110a|aaaa|10bb|bbbb] x 8
+          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const uint16x8_t t0 = vshlq_n_u16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const uint16x8_t t2 = vandq_u16(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const uint16x8_t t3 = vorrq_u16(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+          // 2. merge ASCII and 2-byte codewords
+          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-        }
-        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-            // It is common enough that we have sequences of 16 consecutive ASCII characters.
-            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
-            if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-            }
-            if (vmaxvq_u16(nextin) > 0x7F) {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x8_t utf8_packed = vmovn_u16(in);
-                // 2. store (8 bytes)
-                vst1_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                in = nextin;
-            } else {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-                // 2. store (16 bytes)
-                vst1q_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        if (vmaxvq_u16(in) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-            // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0002, 0x0008,
-                0x0020, 0x0080);
+          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0002, 0x0008,
+                                    0x0020, 0x0080);
 #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0002, 0x0008,
-                0x0020, 0x0080 };
+          const uint16x8_t mask = { 0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0002, 0x0008,
+                                    0x0020, 0x0080 };
 #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (vmaxvq_u16(surrogates_bytemask) == 0) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+          // 4. pack the bytes
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+          const uint8x16_t shuffle = vld1q_u8(row + 1);
+          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+          // 5. store bytes
+          vst1q_u8(utf8_output, utf8_packed);
+
+          // 6. adjust pointers
+          buf += 8;
+          utf8_output += row[0];
+          continue;
+
+    }
+    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+      if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 #else
-            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
 #endif
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
-
-            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-            const uint16x8_t s0 = vshrq_n_u16(in, 12);
-            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
-            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-            // [00bb|bbbb|0000|aaaa]
-            const uint16x8_t s2 = vorrq_u16(s0, s1s);
-            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-            const uint16x8_t s4 = veorq_u16(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
+
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        const uint16x8_t s0 = vshrq_n_u16(in, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+        // [00bb|bbbb|0000|aaaa]
+        const uint16x8_t s2 = vorrq_u16(s0, s1s);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+        const uint16x8_t s4 = veorq_u16(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+        // 4. expand words 16-bit => 32-bit
+        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
 
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0100, 0x0400,
-                0x1000, 0x4000);
-            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                0x0020, 0x0080,
-                0x0200, 0x0800,
-                0x2000, 0x8000);
+        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0100, 0x0400,
+                                    0x1000, 0x4000 );
+        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                                    0x0020, 0x0080,
+                                    0x0200, 0x0800,
+                                    0x2000, 0x8000 );
 #else
-            const uint16x8_t onemask = { 0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0100, 0x0400,
-                0x1000, 0x4000 };
-            const uint16x8_t twomask = { 0x0002, 0x0008,
-                0x0020, 0x0080,
-                0x0200, 0x0800,
-                0x2000, 0x8000 };
+        const uint16x8_t onemask = { 0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0100, 0x0400,
+                                    0x1000, 0x4000 };
+        const uint16x8_t twomask = { 0x0002, 0x0008,
+                                    0x0020, 0x0080,
+                                    0x0200, 0x0800,
+                                    0x2000, 0x8000 };
 #endif
-            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-            const uint16_t mask = vaddvq_u16(combined);
-            // The following fast path may or may not be beneficial.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-              vst1q_u8(utf8_output, utf8_0);
-              utf8_output += 12;
-              vst1q_u8(utf8_output, utf8_1);
-              utf8_output += 12;
-              buf += 8;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += row0[0];
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
-            // surrogate pair(s) in a register
+        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+        const uint16_t mask = vaddvq_u16(combined);
+        // The following fast path may or may not be beneficial.
+        /*if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += 12;
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += row0[0];
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out)
-{
-    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+template <endianness big_endian>
+std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
+  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
     const char16_t* start = buf;
-    const char16_t* end = buf + len;
-
-    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-    while (buf + 16 <= end) {
-        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+  const char16_t* end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  while (buf + 16 <= end) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      #else
+      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+      #endif
+      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+    }
+    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+        // It is common enough that we have sequences of 16 consecutive ASCII characters.
+        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
         if (!match_system(big_endian)) {
+          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+          #else
+          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+          #endif
+          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+        }
+        if(vmaxvq_u16(nextin) > 0x7F) {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x8_t utf8_packed = vmovn_u16(in);
+          // 2. store (8 bytes)
+          vst1_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          in = nextin;
+        } else {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+          // 2. store (16 bytes)
+          vst1q_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 16;
+          utf8_output += 16;
+          continue; // we are done for this round!
+        }
+    }
+
+    if (vmaxvq_u16(in) <= 0x7FF) {
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+          // expected output   : [110a|aaaa|10bb|bbbb] x 8
+          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const uint16x8_t t0 = vshlq_n_u16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const uint16x8_t t2 = vandq_u16(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const uint16x8_t t3 = vorrq_u16(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+          // 2. merge ASCII and 2-byte codewords
+          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-        }
-        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-            // It is common enough that we have sequences of 16 consecutive ASCII characters.
-            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
-            if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-            }
-            if (vmaxvq_u16(nextin) > 0x7F) {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x8_t utf8_packed = vmovn_u16(in);
-                // 2. store (8 bytes)
-                vst1_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                in = nextin;
-            } else {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-                // 2. store (16 bytes)
-                vst1q_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        if (vmaxvq_u16(in) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-            // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0002, 0x0008,
-                0x0020, 0x0080);
+          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0002, 0x0008,
+                                    0x0020, 0x0080);
 #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0002, 0x0008,
-                0x0020, 0x0080 };
+          const uint16x8_t mask = { 0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0002, 0x0008,
+                                    0x0020, 0x0080 };
 #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (vmaxvq_u16(surrogates_bytemask) == 0) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+          // 4. pack the bytes
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+          const uint8x16_t shuffle = vld1q_u8(row + 1);
+          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+          // 5. store bytes
+          vst1q_u8(utf8_output, utf8_packed);
+
+          // 6. adjust pointers
+          buf += 8;
+          utf8_output += row[0];
+          continue;
+
+    }
+    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+      if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 #else
-            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
 #endif
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
-
-            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-            const uint16x8_t s0 = vshrq_n_u16(in, 12);
-            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
-            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-            // [00bb|bbbb|0000|aaaa]
-            const uint16x8_t s2 = vorrq_u16(s0, s1s);
-            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-            const uint16x8_t s4 = veorq_u16(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
+
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        const uint16x8_t s0 = vshrq_n_u16(in, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+        // [00bb|bbbb|0000|aaaa]
+        const uint16x8_t s2 = vorrq_u16(s0, s1s);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+        const uint16x8_t s4 = veorq_u16(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+        // 4. expand words 16-bit => 32-bit
+        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
 
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0100, 0x0400,
-                0x1000, 0x4000);
-            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                0x0020, 0x0080,
-                0x0200, 0x0800,
-                0x2000, 0x8000);
+        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0100, 0x0400,
+                                    0x1000, 0x4000 );
+        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                                    0x0020, 0x0080,
+                                    0x0200, 0x0800,
+                                    0x2000, 0x8000 );
 #else
-            const uint16x8_t onemask = { 0x0001, 0x0004,
-                0x0010, 0x0040,
-                0x0100, 0x0400,
-                0x1000, 0x4000 };
-            const uint16x8_t twomask = { 0x0002, 0x0008,
-                0x0020, 0x0080,
-                0x0200, 0x0800,
-                0x2000, 0x8000 };
+        const uint16x8_t onemask = { 0x0001, 0x0004,
+                                    0x0010, 0x0040,
+                                    0x0100, 0x0400,
+                                    0x1000, 0x4000 };
+        const uint16x8_t twomask = { 0x0002, 0x0008,
+                                    0x0020, 0x0080,
+                                    0x0200, 0x0800,
+                                    0x2000, 0x8000 };
 #endif
-            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-            const uint16_t mask = vaddvq_u16(combined);
-            // The following fast path may or may not be beneficial.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-              vst1q_u8(utf8_output, utf8_0);
-              utf8_output += 12;
-              vst1q_u8(utf8_output, utf8_1);
-              utf8_output += 12;
-              buf += 8;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += row0[0];
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
-            // surrogate pair(s) in a register
+        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+        const uint16_t mask = vaddvq_u16(combined);
+        // The following fast path may or may not be beneficial.
+        /*if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += 12;
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += row0[0];
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output));
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
@@ -15253,764 +14198,729 @@ std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out)
-{
-    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-    const char16_t* end = buf + len;
-
-    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-    while (buf + 16 <= end) {
-        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
-        if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-        }
-
-        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (vmaxvq_u16(surrogates_bytemask) == 0) {
-            // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
-            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
-            utf32_output += 8;
-            buf += 8;
-            // surrogate pair(s) in a register
+template <endianness big_endian>
+std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
+  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+  const char16_t* end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+  while (buf + 16 <= end) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      #else
+      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+      #endif
+      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+    }
+
+    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+      if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
+      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output));
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out)
-{
-    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-    const char16_t* start = buf;
-    const char16_t* end = buf + len;
-
-    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-    while (buf + 16 <= end) {
-        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
-        if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-#else
-            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
-#endif
-            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-        }
-
-        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (vmaxvq_u16(surrogates_bytemask) == 0) {
-            // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
-            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
-            utf32_output += 8;
-            buf += 8;
-            // surrogate pair(s) in a register
+template <endianness big_endian>
+std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
+  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+  const char16_t* start = buf;
+  const char16_t* end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+  while (buf + 16 <= end) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      #else
+      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+      #endif
+      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+    }
+
+    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+      if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
+      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output));
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
 }
 /* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out)
-{
-    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-    const char32_t* end = buf + len;
-
-    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-    uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
-
-    while (buf + 16 <= end) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
-        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
-
-        // Check if no bits set above 16th
-        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-                // 2. store (8 bytes)
-                vst1_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                continue; // we are done for this round!
-            }
-
-            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-                // 1. prepare 2-byte values
-                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-                // expected output   : [110a|aaaa|10bb|bbbb] x 8
-                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-                // t0 = [000a|aaaa|bbbb|bb00]
-                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-                // t1 = [000a|aaaa|0000|0000]
-                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-                // t2 = [0000|0000|00bb|bbbb]
-                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-                // t3 = [000a|aaaa|00bb|bbbb]
-                const uint16x8_t t3 = vorrq_u16(t1, t2);
-                // t4 = [110a|aaaa|10bb|bbbb]
-                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-                // 2. merge ASCII and 2-byte codewords
-                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-                // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0002, 0x0008,
-                    0x0020, 0x0080);
-#else
-                const uint16x8_t mask = { 0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0002, 0x0008,
-                    0x0020, 0x0080 };
-#endif
-                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-                // 4. pack the bytes
-                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-                const uint8x16_t shuffle = vld1q_u8(row + 1);
-                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-                // 5. store bytes
-                vst1q_u8(utf8_output, utf8_packed);
-
-                // 6. adjust pointers
-                buf += 8;
-                utf8_output += row[0];
-                continue;
+std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
+  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+  const char32_t* end = buf + len;
+
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
+
+  while (buf + 16 <= end) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
+
+    // Check if no bits set above 16th
+    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+          // 2. store (8 bytes)
+          vst1_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          continue; // we are done for this round!
+      }
+
+      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
 
-            } else {
-                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-                forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+            // 3. prepare bitmask for 8-bit lookup
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0002, 0x0008,
+                                      0x0020, 0x0080);
+  #else
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0002, 0x0008,
+                                      0x0020, 0x0080 };
+  #endif
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
-                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
-#endif
-                /* In this branch we handle three cases:
-                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-                  We expand the input word (16-bit) into two words (32-bit), thus
-                  we have room for four bytes. However, we need five distinct bit
-                  layouts. Note that the last byte in cases #2 and #3 is the same.
-
-                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-                  in register t2.
-
-                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-                  either byte 1 for case #2 or byte 2 for case #3. Note that they
-                  differ by exactly one bit.
-
-                  Finally from these two words we build proper UTF-8 sequence, taking
-                  into account the case (i.e, the number of bytes to write).
-                */
-                /**
-                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-                 * t2 => [0ccc|cccc] [10cc|cccc]
-                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-                 */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
-
-                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
-                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-                // [00bb|bbbb|0000|aaaa]
-                const uint16x8_t s2 = vorrq_u16(s0, s1s);
-                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-                const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
 
-                // 4. expand words 16-bit => 32-bit
-                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
 
-                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0100, 0x0400,
-                    0x1000, 0x4000);
-                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                    0x0020, 0x0080,
-                    0x0200, 0x0800,
-                    0x2000, 0x8000);
-#else
-                const uint16x8_t onemask = { 0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0100, 0x0400,
-                    0x1000, 0x4000 };
-                const uint16x8_t twomask = { 0x0002, 0x0008,
-                    0x0020, 0x0080,
-                    0x0200, 0x0800,
-                    0x2000, 0x8000 };
-#endif
-                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-                const uint16_t mask = vaddvq_u16(combined);
-                // The following fast path may or may not be beneficial.
-                /*if(mask == 0) {
-                  // We only have three-byte words. Use fast path.
-                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-                  vst1q_u8(utf8_output, utf8_0);
-                  utf8_output += 12;
-                  vst1q_u8(utf8_output, utf8_1);
-                  utf8_output += 12;
-                  buf += 8;
-                  continue;
-                }*/
-                const uint8_t mask0 = uint8_t(mask);
-                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-                vst1q_u8(utf8_output, utf8_0);
-                utf8_output += row0[0];
-                vst1q_u8(utf8_output, utf8_1);
-                utf8_output += row1[0];
-
-                buf += 8;
-            }
-            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
+      } else {
+        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+        forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+  #else
+          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+  #endif
+          /* In this branch we handle three cases:
+            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+            We expand the input word (16-bit) into two words (32-bit), thus
+            we have room for four bytes. However, we need five distinct bit
+            layouts. Note that the last byte in cases #2 and #3 is the same.
+
+            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+            in register t2.
+
+            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+            either byte 1 for case #2 or byte 2 for case #3. Note that they
+            differ by exactly one bit.
+
+            Finally from these two words we build proper UTF-8 sequence, taking
+            into account the case (i.e, the number of bytes to write).
+          */
+          /**
+           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+           * t2 => [0ccc|cccc] [10cc|cccc]
+           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+           */
+  #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+          const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+          const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
+
+          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+          const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+          // [00bb|bbbb|0000|aaaa]
+          const uint16x8_t s2 = vorrq_u16(s0, s1s);
+          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+          const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+          const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+          const uint16x8_t s4 = veorq_u16(s3, m0);
+  #undef simdutf_vec
+
+          // 4. expand words 16-bit => 32-bit
+          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0100, 0x0400,
+                                      0x1000, 0x4000 );
+          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                                      0x0020, 0x0080,
+                                      0x0200, 0x0800,
+                                      0x2000, 0x8000 );
+  #else
+          const uint16x8_t onemask = { 0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0100, 0x0400,
+                                      0x1000, 0x4000 };
+          const uint16x8_t twomask = { 0x0002, 0x0008,
+                                      0x0020, 0x0080,
+                                      0x0200, 0x0800,
+                                      0x2000, 0x8000 };
+  #endif
+          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+          const uint16_t mask = vaddvq_u16(combined);
+          // The following fast path may or may not be beneficial.
+          /*if(mask == 0) {
+            // We only have three-byte words. Use fast path.
+            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += 12;
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += 12;
+            buf += 8;
+            continue;
+          }*/
+          const uint8_t mask0 = uint8_t(mask);
+          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += row0[0];
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += row1[0];
+
+          buf += 8;
+      }
+    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000)==0) {
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) {
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    // check for invalid input
-    if (vmaxvq_u16(forbidden_bytemask) != 0) {
-        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-    }
-    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
-}
-
-std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out)
-{
-    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-    const char32_t* start = buf;
-    const char32_t* end = buf + len;
-
-    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-    while (buf + 16 <= end) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
-        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
-
-        // Check if no bits set above 16th
-        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-                // 1. pack the bytes
-                // obviously suboptimal.
-                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-                // 2. store (8 bytes)
-                vst1_u8(utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                continue; // we are done for this round!
-            }
-
-            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-                // 1. prepare 2-byte values
-                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-                // expected output   : [110a|aaaa|10bb|bbbb] x 8
-                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-                // t0 = [000a|aaaa|bbbb|bb00]
-                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-                // t1 = [000a|aaaa|0000|0000]
-                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-                // t2 = [0000|0000|00bb|bbbb]
-                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-                // t3 = [000a|aaaa|00bb|bbbb]
-                const uint16x8_t t3 = vorrq_u16(t1, t2);
-                // t4 = [110a|aaaa|10bb|bbbb]
-                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-                // 2. merge ASCII and 2-byte codewords
-                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-                // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0002, 0x0008,
-                    0x0020, 0x0080);
-#else
-                const uint16x8_t mask = { 0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0002, 0x0008,
-                    0x0020, 0x0080 };
-#endif
-                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-                // 4. pack the bytes
-                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-                const uint8x16_t shuffle = vld1q_u8(row + 1);
-                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-                // 5. store bytes
-                vst1q_u8(utf8_output, utf8_packed);
-
-                // 6. adjust pointers
-                buf += 8;
-                utf8_output += row[0];
-                continue;
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  // check for invalid input
+  if (vmaxvq_u16(forbidden_bytemask) != 0) {
+    return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+  }
+  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+}
+
+
+std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
+  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+  const char32_t* start = buf;
+  const char32_t* end = buf + len;
+
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  while (buf + 16 <= end) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
+
+    // Check if no bits set above 16th
+    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+          // 1. pack the bytes
+          // obviously suboptimal.
+          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+          // 2. store (8 bytes)
+          vst1_u8(utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          continue; // we are done for this round!
+      }
+
+      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
 
-            } else {
-                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-                // check for invalid input
-                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-                const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
-                if (vmaxvq_u16(forbidden_bytemask) != 0) {
-                    return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
-                }
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+            // 3. prepare bitmask for 8-bit lookup
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0002, 0x0008,
+                                      0x0020, 0x0080);
+  #else
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0002, 0x0008,
+                                      0x0020, 0x0080 };
+  #endif
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
-                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
-#endif
-                /* In this branch we handle three cases:
-                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-                  We expand the input word (16-bit) into two words (32-bit), thus
-                  we have room for four bytes. However, we need five distinct bit
-                  layouts. Note that the last byte in cases #2 and #3 is the same.
-
-                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-                  in register t2.
-
-                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-                  either byte 1 for case #2 or byte 2 for case #3. Note that they
-                  differ by exactly one bit.
-
-                  Finally from these two words we build proper UTF-8 sequence, taking
-                  into account the case (i.e, the number of bytes to write).
-                */
-                /**
-                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-                 * t2 => [0ccc|cccc] [10cc|cccc]
-                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-                 */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
-
-                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
-                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-                // [00bb|bbbb|0000|aaaa]
-                const uint16x8_t s2 = vorrq_u16(s0, s1s);
-                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-                const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
 
-                // 4. expand words 16-bit => 32-bit
-                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
 
-                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0100, 0x0400,
-                    0x1000, 0x4000);
-                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                    0x0020, 0x0080,
-                    0x0200, 0x0800,
-                    0x2000, 0x8000);
-#else
-                const uint16x8_t onemask = { 0x0001, 0x0004,
-                    0x0010, 0x0040,
-                    0x0100, 0x0400,
-                    0x1000, 0x4000 };
-                const uint16x8_t twomask = { 0x0002, 0x0008,
-                    0x0020, 0x0080,
-                    0x0200, 0x0800,
-                    0x2000, 0x8000 };
-#endif
-                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-                const uint16_t mask = vaddvq_u16(combined);
-                // The following fast path may or may not be beneficial.
-                /*if(mask == 0) {
-                  // We only have three-byte words. Use fast path.
-                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-                  vst1q_u8(utf8_output, utf8_0);
-                  utf8_output += 12;
-                  vst1q_u8(utf8_output, utf8_1);
-                  utf8_output += 12;
-                  buf += 8;
-                  continue;
-                }*/
-                const uint8_t mask0 = uint8_t(mask);
-
-                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-                vst1q_u8(utf8_output, utf8_0);
-                utf8_output += row0[0];
-                vst1q_u8(utf8_output, utf8_1);
-                utf8_output += row1[0];
-
-                buf += 8;
-            }
-            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
+      } else {
+        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+        // check for invalid input
+        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+        const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
+        if (vmaxvq_u16(forbidden_bytemask) != 0) {
+          return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
+        }
+
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+  #else
+          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+  #endif
+          /* In this branch we handle three cases:
+            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+            We expand the input word (16-bit) into two words (32-bit), thus
+            we have room for four bytes. However, we need five distinct bit
+            layouts. Note that the last byte in cases #2 and #3 is the same.
+
+            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+            in register t2.
+
+            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+            either byte 1 for case #2 or byte 2 for case #3. Note that they
+            differ by exactly one bit.
+
+            Finally from these two words we build proper UTF-8 sequence, taking
+            into account the case (i.e, the number of bytes to write).
+          */
+          /**
+           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+           * t2 => [0ccc|cccc] [10cc|cccc]
+           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+           */
+  #define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+          const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+          const uint16x8_t t2 = vorrq_u16 (t1, simdutf_vec(0b1000000000000000));
+
+          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+          const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+          // [00bb|bbbb|0000|aaaa]
+          const uint16x8_t s2 = vorrq_u16(s0, s1s);
+          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+          const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+          const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+          const uint16x8_t s4 = veorq_u16(s3, m0);
+  #undef simdutf_vec
+
+          // 4. expand words 16-bit => 32-bit
+          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0100, 0x0400,
+                                      0x1000, 0x4000 );
+          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                                      0x0020, 0x0080,
+                                      0x0200, 0x0800,
+                                      0x2000, 0x8000 );
+  #else
+          const uint16x8_t onemask = { 0x0001, 0x0004,
+                                      0x0010, 0x0040,
+                                      0x0100, 0x0400,
+                                      0x1000, 0x4000 };
+          const uint16x8_t twomask = { 0x0002, 0x0008,
+                                      0x0020, 0x0080,
+                                      0x0200, 0x0800,
+                                      0x2000, 0x8000 };
+  #endif
+          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+          const uint16_t mask = vaddvq_u16(combined);
+          // The following fast path may or may not be beneficial.
+          /*if(mask == 0) {
+            // We only have three-byte words. Use fast path.
+            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += 12;
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += 12;
+            buf += 8;
+            continue;
+          }*/
+          const uint8_t mask0 = uint8_t(mask);
+
+          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += row0[0];
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += row1[0];
+
+          buf += 8;
+      }
+    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000)==0) {
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) {
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output));
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output));
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
-template<endianness big_endian>
-std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out)
-{
-    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-    const char32_t* end = buf + len;
-
-    uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
-
-    while (buf + 4 <= end) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
-
-        // Check if no bits set above 16th
-        if (vmaxvq_u32(in) <= 0xFFFF) {
-            uint16x4_t utf16_packed = vmovn_u32(in);
-
-            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-            forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
-
-            if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-#else
-                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
-#endif
-                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-            }
-            vst1_u16(utf16_output, utf16_packed);
-            utf16_output += 4;
-            buf += 4;
+template <endianness big_endian> // UTF-32 -> UTF-16 with NEON; big_endian selects the output byte order (see match_system below)
+std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) { // returns {first unconverted input, output end}, or {nullptr, output end} on a surrogate or > 0x10FFFF code point
+  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+  const char32_t* end = buf + len;
+
+  uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); // sticky: ORs in every surrogate hit from the vector path, tested once after the loop
+
+  while(buf + 4 <= end) { // a tail of fewer than 4 code points is not converted here
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+
+    // Check if no bits set above 16th
+    if(vmaxvq_u32(in) <= 0xFFFF) {
+      uint16x4_t utf16_packed = vmovn_u32(in); // narrow each 32-bit lane to 16 bits; nothing is lost because every lane is <= 0xFFFF
+
+      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+      forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+      if (!match_system(big_endian)) {
+        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+        #else
+        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
+        #endif
+        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); // swap the two bytes of each 16-bit unit
+      }
+      vst1_u16(utf16_output, utf16_packed);
+      utf16_output += 4;
+      buf += 4;
+    } else {
+      size_t forward = 3; // scalar path: convert up to 3 code points, then go back to the vector path
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} // defensive clamp; the loop condition already guarantees end - buf >= 4
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
+          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
         } else {
-            size_t forward = 3;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
-                    }
-                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (!match_system(big_endian)) {
-                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
-        }
-    }
-
-    // check for invalid input
-    if (vmaxv_u16(forbidden_bytemask) != 0) {
-        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
-    }
-
-    return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
-}
-
-template<endianness big_endian>
-std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out)
-{
-    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-    const char32_t* start = buf;
-    const char32_t* end = buf + len;
-
-    while (buf + 4 <= end) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
-
-        // Check if no bits set above 16th
-        if (vmaxvq_u32(in) <= 0xFFFF) {
-            uint16x4_t utf16_packed = vmovn_u32(in);
-
-            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-            const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
-            if (vmaxv_u16(forbidden_bytemask) != 0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
-            }
-
-            if (!match_system(big_endian)) {
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-#else
-                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
-#endif
-                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-            }
-            vst1_u16(utf16_output, utf16_packed);
-            utf16_output += 4;
-            buf += 4;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
+          word -= 0x10000; // the remaining 20-bit value is split into two 10-bit halves below
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (!match_system(big_endian)) {
+            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k;
+    }
+  }
+
+  // check for invalid input
+  if (vmaxv_u16(forbidden_bytemask) != 0) {
+    return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+  }
+
+  return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
+}
+
+
+template <endianness big_endian> // UTF-32 -> UTF-16 with NEON, reporting an error_code and an input position instead of a null pointer
+std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) { // the count in the returned result is an offset from the initial buf, on success and on error
+  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+  const char32_t* start = buf;
+  const char32_t* end = buf + len;
+
+  while(buf + 4 <= end) { // a tail of fewer than 4 code points is not converted here
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+
+    // Check if no bits set above 16th
+    if(vmaxvq_u32(in) <= 0xFFFF) {
+      uint16x4_t utf16_packed = vmovn_u32(in);
+
+      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+      const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
+      if (vmaxv_u16(forbidden_bytemask) != 0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output)); // NOTE(review): position is the start of this 4-code-point block, not the offending lane
+      }
+
+      if (!match_system(big_endian)) {
+        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+        #else
+        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
+        #endif
+        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap)); // swap the two bytes of each 16-bit unit
+      }
+      vst1_u16(utf16_output, utf16_packed);
+      utf16_output += 4;
+      buf += 4;
+    } else {
+      size_t forward = 3; // scalar path: convert up to 3 code points, then go back to the vector path
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} // defensive clamp; the loop condition already guarantees end - buf >= 4
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); } // exact index of the bad code point
+          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
         } else {
-            size_t forward = 3;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
-                    }
-                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (!match_system(big_endian)) {
-                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); } // exact index of the bad code point
+          word -= 0x10000; // the remaining 20-bit value is split into two 10-bit halves below
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (!match_system(big_endian)) {
+            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k;
     }
+  }
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
+  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
@@ -16026,103 +14936,85 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
-    simdutf_really_inline size_t block_index();
-    simdutf_really_inline bool has_full_block() const;
-    simdutf_really_inline const uint8_t* full_block() const;
-    /**
-     * Get the last block, padded with spaces.
-     *
-     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-     * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-     *
-     * @return the number of effective characters in the last block.
-     */
-    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
-    simdutf_really_inline void advance();
-
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index(); // current offset (idx) into buf
+  simdutf_really_inline bool has_full_block() const; // true while idx < lenminusstep
+  simdutf_really_inline const uint8_t *full_block() const; // pointer to buf[idx]; performs no bounds check itself
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function returns 0 without writing to dst). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance(); // moves idx forward by STEP_SIZE
 private:
-    const uint8_t* buf;
-    const size_t len;
-    const size_t lenminusstep;
-    size_t idx;
+  const uint8_t *buf; // start of the input
+  const size_t len; // total input length in bytes
+  const size_t lenminusstep; // len - STEP_SIZE, clamped to 0 when len < STEP_SIZE
+  size_t idx; // offset of the current block within buf
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text_64(const uint8_t* text)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text_64(const uint8_t *text) { // debug helper: printable copy of sizeof(simd8x64<uint8_t>) bytes starting at text
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // allocated on the first call and reused; each call overwrites the previous result
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // signed compare: control bytes (< 0x20) and bytes >= 0x80 both become '_'
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    in.store(reinterpret_cast<uint8_t*>(buf));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        if (buf[i] < ' ') {
-            buf[i] = '_';
-        }
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) { // debug helper: printable copy of the bytes held in a simd8x64 block
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // allocated on the first call and reused; each call overwrites the previous result
+  in.store(reinterpret_cast<uint8_t*>(buf));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    if (int8_t(buf[i]) < ' ') { buf[i] = '_'; } // explicit signed compare, as in format_input_text_64: plain char may be unsigned, which would leave bytes >= 0x80 unmasked
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_unused static char* format_mask(uint64_t mask)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
-    for (size_t i = 0; i < 64; i++) {
-        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-    }
-    buf[64] = '\0';
-    return buf;
+simdutf_unused static char * format_mask(uint64_t mask) { // debug helper: 64-character string with 'X' where bit i of mask is set and ' ' where it is clear
+  static char *buf = reinterpret_cast<char*>(malloc(64 + 1)); // allocated on the first call and reused; each call overwrites the previous result
+  for (size_t i=0; i<64; i++) {
+    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' '; // 64-bit one: size_t(1) << i is undefined for i >= 32 where size_t is only 32 bits wide
+  }
+  buf[64] = '\0';
+  return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
-    : buf { _buf }
-    , len { _len }
-    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
-    , idx { 0 }
-{
-}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} // lenminusstep is clamped to 0 so the unsigned subtraction cannot wrap when len < STEP_SIZE
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
-{
-    return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const { // true while more than STEP_SIZE bytes remain after idx
+  return idx < lenminusstep; // strict '<': an input of exactly STEP_SIZE bytes yields no full block and is left to get_remainder
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
-{
-    return &buf[idx];
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const { // start of the current block inside the input
+  return &buf[idx]; // no bounds check here; presumably callers test has_full_block() first -- confirm at call sites
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
-{
-    if (len == idx) {
-        return 0;
-    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-    std::memcpy(dst, buf + idx, len - idx);
-    return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const { // copies the unread tail into dst, space-padded to STEP_SIZE bytes; returns the number of real bytes copied
+  if(len == idx) { return 0; } // nothing left: dst is not touched; memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx); // NOTE(review): assumes len - idx <= STEP_SIZE, i.e. every full block was consumed first -- confirm at call sites
+  return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
-{
-    idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() { // step to the next block
+  idx += STEP_SIZE; // unconditional: no check against len is made here
 }
 
 } // unnamed namespace
@@ -16138,22 +15030,21 @@ namespace utf8_validation {
 
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -16161,92 +15052,101 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input, // combines the special-case result sc with the check on the bytes 2 and 3 positions back
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) { // returns sc with its 0x80 bit flipped in every lane where must_be_2_3_continuation(prev2, prev3) is set
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the block:
-// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
-{
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) { // compares input against per-lane maxima; only the last three lanes have a maximum below 255
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); // take the last sizeof(simd8<uint8_t>) entries so the three thresholds land on the final three lanes
     return input.gt_bits(max_value);
-}
+  }
 
-struct utf8_checker {
+  struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -16257,54 +15157,51 @@ struct utf8_checker {
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof()
-    {
-        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-        // possibly finish them.
-        this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof() {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
-    {
-        if (simdutf_likely(is_ascii(input))) {
-            this->error |= this->prev_incomplete;
-        } else {
-            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                "We support either two or four chunks per 64-byte block.");
-            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-            }
-            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
+      if(simdutf_likely(is_ascii(input))) {
+        this->error |= this->prev_incomplete;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
+        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+
+      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -16324,16 +15221,15 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t* input, size_t length)
-{
-    checker c {};
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -16342,106 +15238,97 @@ bool generic_validate_utf8(const uint8_t* input, size_t length)
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char* input, size_t length)
-{
-    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_utf8(const char * input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
-{
-    checker c {};
+result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
+    size_t count{0};
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        if (c.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            res.count += count;
-            return res;
-        }
-        reader.advance();
-        count += 64;
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      if(c.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        res.count += count;
+        return res;
+      }
+      reader.advance();
+      count += 64;
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-        res.count += count;
-        return res;
+      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+      res.count += count;
+      return res;
     } else {
-        return result(error_code::SUCCESS, length);
+      return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char* input, size_t length)
-{
-    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_utf8_with_errors(const char * input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t* input, size_t length)
-{
+bool generic_validate_ascii(const uint8_t * input, size_t length) {
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64] {};
+    uint8_t blocks[64]{};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        running_or |= in;
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      running_or |= in;
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char* input, size_t length)
-{
-    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_ascii(const char * input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
-{
-    buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
-    while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        if (!in.is_ascii()) {
-            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            return result(res.error, count + res.count);
-        }
-        reader.advance();
-
-        count += 64;
-    }
-    uint8_t block[64] {};
-    reader.get_remainder(block);
-    simd::simd8x64<uint8_t> in(block);
+result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
     if (!in.is_ascii()) {
-        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        return result(res.error, count + res.count);
-    } else {
-        return result(error_code::SUCCESS, length);
+      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+      return result(res.error, count + res.count);
     }
+    reader.advance();
+
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
 }
 
-result generic_validate_ascii_with_errors(const char* input, size_t length)
-{
-    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_ascii_with_errors(const char * input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 } // namespace utf8_validation
@@ -16453,6 +15340,7 @@ result generic_validate_ascii_with_errors(const char* input, size_t length)
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -16460,64 +15348,63 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template<endianness endian>
+template <endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept
-{
-    // The implementation is not specific to haswell and should be moved to the generic directory.
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        // this loop could be unrolled further. For example, we could process the mask
-        // far more than 64 bytes.
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf16<endian>(utf16_output);
-            utf16_output += 64;
-            pos += 64;
-        } else {
-            // Slow path. We hope that the compiler will recognize that this is a slow path.
-            // Anything that is not a continuation mask is a 'leading byte', that is, the
-            // start of a new code point.
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            // The *start* of code points is not so useful, rather, we want the *end* of code points.
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            // We process in blocks of up to 12 bytes except possibly
-            // for fast paths which may process up to 16 bytes. For the
-            // slow path to work, we should have at least 12 input bytes left.
-            size_t max_starting_point = (pos + 64) - 12;
-            // Next loop is going to run at least five times when using solely
-            // the slow/regular path, and at least four times if there are fast paths.
-            while (pos < max_starting_point) {
-                // Performance note: our ability to compute 'consumed' and
-                // then shift and recompute is critical. If there is a
-                // latency of, say, 4 cycles on getting 'consumed', then
-                // the inner loop might have a total latency of about 6 cycles.
-                // Yet we process between 6 to 12 inputs bytes, thus we get
-                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                // for this section of the code. Hence, there is a limit
-                // to how much we can further increase this latency before
-                // it seriously harms performance.
-                //
-                // Thus we may allow convert_masked_utf8_to_utf16 to process
-                // more bytes at a time under a fast-path mode where 16 bytes
-                // are consumed at once (e.g., when encountering ASCII).
-                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                    utf8_end_of_code_point_mask, utf16_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-            // At this point there may remain between 0 and 12 bytes in the
-            // 64-byte block. These bytes will be processed again. So we have an
-            // 80% efficiency (in the worst case). In practice we expect an
-            // 85% to 90% efficiency.
-        }
-    }
-    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-    return utf16_output - start;
+    char16_t* utf16_output) noexcept {
+  // The implementation is not specific to haswell and should be moved to the generic directory.
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the mask
+    // far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow path.
+      // Anything that is not a continuation byte is a 'leading byte', that is, the
+      // start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end* of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while(pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -16528,28 +15415,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -16557,274 +15445,275 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    template<endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf16_output += howmany;
-        }
-        return utf16_output - start;
-    }
-
-    template<endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+    template <endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, the loop exits with margin at the index of the eighth-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+        if(howmany == 0) { return 0; }
+        utf16_output += howmany;
+      }
+      return utf16_output - start;
+    }
+
+    template <endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, the loop exits with margin at the index of the eighth-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf16_output += res.count;
         }
-        if (pos < size) {
-            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf16_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf16_output - start);
+      }
+      return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace arm64
@@ -16841,36 +15730,37 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
+
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept
-{
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf32(utf32_output);
-            utf32_output += 64;
-            pos += 64;
-        } else {
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            size_t max_starting_point = (pos + 64) - 12;
-            while (pos < max_starting_point) {
-                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                    utf8_end_of_code_point_mask, utf32_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-        }
+    char32_t* utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+    // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+    size_t max_starting_point = (pos + 64) - 12;
+    while(pos < max_starting_point) {
+      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                          utf8_end_of_code_point_mask, utf32_output);
+      pos += consumed;
+      utf8_end_of_code_point_mask >>= consumed;
+      }
     }
-    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-    return utf32_output - start;
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+  return utf32_output - start;
 }
 
+
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace arm64
@@ -16879,28 +15769,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
+
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -16908,266 +15799,268 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf32_output += howmany;
-        }
-        return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // NOTE(review): nothing is flipped here; only prev1 is derived (presumably the input shifted
+      // back one byte using prev_input - confirm), then special cases and multibyte lengths are checked.
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 to 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+        if(howmany == 0) { return 0; }
+        utf32_output += howmany;
+      }
+      return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 to 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf32_output += res.count;
         }
-        if (pos < size) {
-            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf32_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf32_output - start);
+      }
+      return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace arm64
@@ -17184,37 +16077,36 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size)
-{
+simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        count += 64 - count_ones(utf8_continuation_mask);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        // We count one word for anything that is not a continuation (so
-        // leading bytes).
-        count += 64 - count_ones(utf8_continuation_mask);
-        int64_t utf8_4byte = input.gteq_unsigned(240);
-        count += count_ones(utf8_4byte);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      // We count one word for anything that is not a continuation (so
+      // leading bytes).
+      count += 64 - count_ones(utf8_continuation_mask);
+      int64_t utf8_4byte = input.gteq_unsigned(240);
+      count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -17229,65 +16121,57 @@ namespace arm64 {
 namespace {
 namespace utf16 {
 
-template<endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-        count += count_ones(not_pair) / 2;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+      count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t ascii_mask = input.lteq(0x7F);
-        uint64_t twobyte_mask = input.lteq(0x7FF);
-        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-        size_t ascii_count = count_ones(ascii_mask) / 2;
-        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t ascii_mask = input.lteq(0x7F);
+      uint64_t twobyte_mask = input.lteq(0x7FF);
+      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+      size_t ascii_count = count_ones(ascii_mask) / 2;
+      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
+      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
+      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
-{
-    size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
+  size_t pos = 0;
 
-    while (pos + 32 <= size) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        input.swap_bytes();
-        input.store(reinterpret_cast<uint16_t*>(output));
-        pos += 32;
-        output += 32;
-    }
+  while (pos + 32 <= size) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
 
-    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -17304,666 +16188,557 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace arm64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    if (length % 2 == 0) {
-        return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
+simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  if (length % 2 == 0) {
+    return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
+  } else {
+    if (implementation::validate_utf8(input, length)) {
+      return simdutf::encoding_type::UTF8;
     } else {
-        if (implementation::validate_utf8(input, length)) {
-            return simdutf::encoding_type::UTF8;
-        } else {
-            return simdutf::encoding_type::unspecified;
-        }
+      return simdutf::encoding_type::unspecified;
     }
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    return arm64::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_utf8(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return arm64::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_ascii(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
-{
-    const char32_t* tail = arm_validate_utf32le(buf, len);
-    if (tail) {
-        return scalar::utf32::validate(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t* tail = arm_validate_utf32le(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
-    result res = arm_validate_utf32le_with_errors(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  result res = arm_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
-{
-    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept
-{
-    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+    char32_t* utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16le_to_utf8(buf, len, utf8_output);
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words (char32_t units) written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    return saved_bytes;
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words (char32_t units) written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_with_errors(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_valid(buf,len,latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf32_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words (char16_t units) written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    return saved_bytes;
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words (char16_t units) written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16be(buf, len, utf16_output);
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
+  return scalar::utf8::latin1_length_from_utf8(buf,len);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32( size_t length) const noexcept {
+  return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::latin1_length_from_utf8(buf, len);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
+  return scalar::latin1::utf8_length_from_latin1(input,length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
-{
-    return scalar::utf16::latin1_length_from_utf16(length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
-{
-    return scalar::utf32::latin1_length_from_utf32(length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
-{
-    return scalar::latin1::utf8_length_from_latin1(input, length);
-}
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
-}
 
-simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf32_length_from_latin1(length);
-}
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
-    const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
-    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 4 <= length; pos += 4) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
-        const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
-        const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
-        const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
-        const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
+  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
+  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 4 <= length; pos += 4) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
+    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
+    const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
 
-        const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
-        const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
-        const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+    const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
+    const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
+    const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
 
-        const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
-        const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+    const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
+    const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
 
-        size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
-        size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
-        size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+    size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
+    size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
+    size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
 
-        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
-    }
-    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
+  }
+  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 4 <= length; pos += 4) {
-        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
-        const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
-        const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
-        const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
-        size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
-        count += 4 + surrogate_count;
-    }
-    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 4 <= length; pos += 4) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
+    const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
+    const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
+    size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
+    count += 4 + surrogate_count;
+  }
+  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return utf8::utf32_length_from_utf8(input, length);
 }
 
 } // namespace arm64
@@ -17983,403 +16758,328 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* in
 // #define SIMDUTF_IMPLEMENTATION fallback
 /* end file src/simdutf/fallback/begin.h */
 
+
+
+
+
+
+
+
+
 namespace simdutf {
 namespace fallback {
 
-simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    int out = 0;
-    if (validate_utf8(input, length)) {
-        out |= encoding_type::UTF8;
-    }
-    if ((length % 2) == 0) {
-        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
-            out |= encoding_type::UTF16_LE;
-        }
-    }
-    if ((length % 4) == 0) {
-        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
-            out |= encoding_type::UTF32_LE;
-        }
-    }
+simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
+  }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
+  }
 
-    return out;
+  return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return scalar::utf8::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+  return scalar::utf8::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return scalar::ascii::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return scalar::ascii::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    return scalar::ascii::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  return scalar::ascii::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate<endianness::LITTLE>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate<endianness::BIG>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
-{
-    return scalar::utf32::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
-    return scalar::utf32::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf,len,utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept
-{
-    return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
+    char32_t* utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert_valid(input, size,  utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    scalar::utf16::change_endianness_utf16(input, length, output);
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
+  scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::latin1_length_from_utf8(buf, len);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
+  return scalar::utf8::latin1_length_from_utf8(buf,len);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
-{
-    return scalar::utf16::latin1_length_from_utf16(length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
-{
-    return length;
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32( size_t length) const noexcept {
+  return length;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
-{
-    return scalar::latin1::utf8_length_from_latin1(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
+  return scalar::latin1::utf8_length_from_latin1(input,length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  return scalar::utf32::utf8_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  return scalar::utf32::utf16_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf32_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace fallback
@@ -18394,6 +17094,7 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* in
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/implementation.cpp
 /* begin file src/icelake/implementation.cpp */
 
+
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/begin.h
 /* begin file src/simdutf/icelake/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
@@ -18406,7 +17107,7 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe - uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 namespace simdutf {
@@ -18418,8 +17119,7 @@ namespace {
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
 /* begin file src/icelake/icelake_utf8_common.inl.cpp */
 // Common procedures for both validating and non-validating conversions from UTF-8.
-enum block_processing_mode { SIMDUTF_FULL,
-    SIMDUTF_TAIL };
+enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL};
 
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
@@ -18435,329 +17135,302 @@ using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
     The provided in and out pointers are advanced according to how many input
     bytes have been processed, upon success.
 */
-template<block_processing_mode tail, endianness big_endian>
-simdutf_really_inline bool process_block_utf8_to_utf16(const char*& in, char16_t*& out, size_t gap)
-{
-    // constants
-    __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-    __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
-    __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
-    __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
-    __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
-    __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
-    __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
-    __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
-    __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
-    __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    // Note that 'tail' is a compile-time constant !
-    __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
-    __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
-    __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
-    if (_ktestc_mask64_u8(m1, b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
-        // alternatively, we could do 'if (m1 == b) { '
-        if (tail == SIMDUTF_FULL) {
-            in += 64; // consumed 64 bytes
-            // we convert a full 64-byte block, writing 128 bytes.
-            __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-            if (big_endian) {
-                input1 = _mm512_shuffle_epi8(input1, byteflip);
-            }
-            _mm512_storeu_si512(out, input1);
-            out += 32;
-            __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-            if (big_endian) {
-                input2 = _mm512_shuffle_epi8(input2, byteflip);
-            }
-            _mm512_storeu_si512(out, input2);
-            out += 32;
-            return true; // we are done
-        } else {
-            in += gap;
-            if (gap <= 32) {
-                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-                if (big_endian) {
-                    input1 = _mm512_shuffle_epi8(input1, byteflip);
-                }
-                _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
-                out += gap;
-            } else {
-                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-                if (big_endian) {
-                    input1 = _mm512_shuffle_epi8(input1, byteflip);
-                }
-                _mm512_storeu_si512(out, input1);
-                out += 32;
-                __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-                if (big_endian) {
-                    input2 = _mm512_shuffle_epi8(input2, byteflip);
-                }
-                _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
-                out += gap - 32;
-            }
-            return true; // we are done
-        }
-    }
-    // classify characters further
-    __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
-        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
-    __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
-        _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
-
-    __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
-        _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
-                        // Overlong 2-byte sequence
-    if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
-        // Overlong 2-byte sequence
-        return false;
-    }
-    if (_ktestz_mask64_u8(m34, m34) == 0) {
-        // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
-        __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
-            _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
-
-        __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
-
-        __mmask64 mp1 = _kshiftli_mask64(m234, 1);
-        __mmask64 mp2 = _kshiftli_mask64(m34, 2);
-        // We could do it as follows...
-        // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-        // but GCC generates better code when we do:
-        if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-            // Fast path with 1,2,3 bytes
-            __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
-            __mmask64 m1234 = _kor_mask64(m1, m234);
-            // mismatched continuation bytes:
-            if (tail == SIMDUTF_FULL) {
-                __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-                // the presence of a 1 bit indicates that they overlap.
-                // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-                if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
-                    return false;
-                }
-            } else {
-                __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-                if (mc != bxorm1234) {
-                    return false;
-                }
-            }
-            // mend: identifying the last bytes of each sequence to be decoded
-            __mmask64 mend = _kshiftri_mask64(m1234, 1);
-            if (tail != SIMDUTF_FULL) {
-                mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
-            }
-
-            __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-            __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-            __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-            __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
-            __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-                clearedbytes); // the last byte of each character
-
-            __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
-            __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-            __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-            __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-                beforeasciibytes); // the second last bytes (of two, three byte seq,
-                                   // surrogates)
-            secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
-
-            __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-                indexofsecondlastbytes); // indices of the second last bytes
-            __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-                clearedbytes); // only those that are the third last byte of a sequece
-            __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-                thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                                // surrogate)
-            thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
-            __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-            // the elements of Wout excluding the last element if it happens to be a high surrogate:
-
-            __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-            // Encodings out of range...
-            {
-                // the location of 3-byte sequence start bytes in the input
-                __mmask64 m3 = m34 & (b ^ m4);
-                // words in Wout corresponding to 3-byte sequences.
-                __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-                __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-                __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-                __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-                __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-                __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-                if (_kor_mask32(Msmall800, M3s)) {
-                    return false;
-                }
-            }
-            int64_t nout = _mm_popcnt_u64(mprocessed);
-            in += 64 - _lzcnt_u64(mprocessed);
-            if (big_endian) {
-                Wout = _mm512_shuffle_epi8(Wout, byteflip);
-            }
-            _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-            out += nout;
-            return true; // ok
-        }
-        //
-        // We have a 4-byte sequence, this is the general case.
-        // Slow!
-        __mmask64 mp3 = _kshiftli_mask64(m4, 3);
-        __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
-        __mmask64 m1234 = _kor_mask64(m1, m234);
-
-        // mend: identifying the last bytes of each sequence to be decoded
-        __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
-        if (tail != SIMDUTF_FULL) {
-            mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
-        }
-        __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-        __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-        __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-        __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
-        __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-            clearedbytes); // the last byte of each character
-
-        __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
-        __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-        __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-        __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-            beforeasciibytes); // the second last bytes (of two, three byte seq,
-                               // surrogates)
-        secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
-
-        __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-            indexofsecondlastbytes); // indices of the second last bytes
-        __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-            clearedbytes); // only those that are the third last byte of a sequece
-        __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-            thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                            // surrogate)
-        thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
-        __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-        uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
-        __mmask32 Mlo = __mmask32(Mlo_uint64);
-        __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
-        __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
-            mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
-        __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
-            4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
-        __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
-            lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
-        __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
-            mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
-        // the elements of Wout excluding the last element if it happens to be a high surrogate:
-        __mmask32 Mout = ~(Mhi & 0x80000000);
-        __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-        // mismatched continuation bytes:
-        if (tail == SIMDUTF_FULL) {
-            __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-            // the presence of a 1 bit indicates that they overlap.
-            // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-            if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
-                return false;
-            }
-        } else {
-            __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-            if (mc != bxorm1234) {
-                return false;
-            }
-        }
-        // Encodings out of range...
-        {
-            // the location of 3-byte sequence start bytes in the input
-            __mmask64 m3 = m34 & (b ^ m4);
-            // words in Wout corresponding to 3-byte sequences.
-            __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-            __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-            __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-            __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-            __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-            __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-            __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
-            __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
-            if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
-                return false;
-            }
-        }
-        in += 64 - _lzcnt_u64(mprocessed);
-        int64_t nout = _mm_popcnt_u64(mprocessed);
-        if (big_endian) {
-            Wout = _mm512_shuffle_epi8(Wout, byteflip);
-        }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-        out += nout;
-        return true; // ok
-    }
-    // Fast path 2: all ASCII or 2 byte
-    __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
-    // on top of -0xc0 we substract -2 which we get back later of the
-    // continuation byte tags
-    __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
-    __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
+template <block_processing_mode tail, endianness big_endian>
+simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
+  // constants
+  __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
+  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
+  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
+  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
+  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
+  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
+  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
+  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
+  __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  // Note that 'tail' is a compile-time constant !
+  __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
+  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
+  __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
+  if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
+  // alternatively, we could do 'if (m1 == b) { '
     if (tail == SIMDUTF_FULL) {
-        __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
-        if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
-            return false;
-        }
+      in += 64;          // consumed 64 bytes
+      // we convert a full 64-byte block, writing 128 bytes.
+      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+      if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
+      _mm512_storeu_si512(out, input1);
+      out += 32;
+      __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+      if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
+      _mm512_storeu_si512(out, input2);
+      out += 32;
+      return true; // we are done
     } else {
-        __mmask64 bxorleading = _kxor_mask64(b, leading);
-        if (_kshiftli_mask64(m234, 1) != bxorleading) {
-            return false;
-        }
+      in += gap;
+      if (gap <= 32) {
+        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
+        out += gap;
+      } else {
+        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
+        _mm512_storeu_si512(out, input1);
+        out += 32;
+        __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+        if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
+        _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
+        out += gap - 32;
+      }
+      return true; // we are done
+    }
+  }
+  // classify characters further
+  __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
+                                        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
+  __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
+                                       _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
+
+  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
+                                                     _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
+                                                                     // Overlong 2-byte sequence
+  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
+    // Overlong 2-byte sequence
+    return false;
+  }
+  if (_ktestz_mask64_u8(m34, m34) == 0) {
+    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
+    __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
+                                        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
+
+    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
+
+    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
+    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
+    // We could do it as follows...
+    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+    // but GCC generates better code when we do:
+    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+      // Fast path with 1,2,3 bytes
+      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
+      __mmask64 m1234 = _kor_mask64(m1, m234);
+      // mismatched continuation bytes:
+      if (tail == SIMDUTF_FULL) {
+        __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+        // the presence of a 1 bit indicates that they overlap.
+        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
+      } else {
+        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+        if (mc != bxorm1234) { return false; }
+      }
+      // mend: identifying the last bytes of each sequence to be decoded
+      __mmask64 mend = _kshiftri_mask64(m1234, 1);
+      if (tail != SIMDUTF_FULL) {
+        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
+      }
+
+
+      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+      __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+      __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+      __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
+      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+                                                        clearedbytes); // the last byte of each character
+
+      __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
+      __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+      __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+                                                              beforeasciibytes); // the second last bytes (of two, three byte seq,
+                                                                                 // surrogates)
+      secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
+
+      __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+                                                       indexofsecondlastbytes); // indices of the third last bytes
+      __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+                                                    clearedbytes); // only those that are the third last byte of a sequence
+      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+                                                             thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                                                                             // surrogate)
+      thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
+      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+      // the elements of Wout excluding the last element if it happens to be a high surrogate:
+
+      __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+
+      // Encodings out of range...
+      {
+        // the location of 3-byte sequence start bytes in the input
+        __mmask64 m3 = m34 & (b ^ m4);
+        // words in Wout corresponding to 3-byte sequences.
+        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+        __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+        __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+        if (_kor_mask32(Msmall800, M3s)) { return false; }
+      }
+      int64_t nout = _mm_popcnt_u64(mprocessed);
+      in +=  64 - _lzcnt_u64(mprocessed);
+      if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
+      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+      out += nout;
+      return true; // ok
     }
     //
+    // We have a 4-byte sequence, this is the general case.
+    // Slow!
+    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
+    __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
+    __mmask64 m1234 = _kor_mask64(m1, m234);
+
+    // mend: identifying the last bytes of each sequence to be decoded
+    __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
+    if (tail != SIMDUTF_FULL) {
+      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
+    }
+    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+    __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+    __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+    __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
+    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+                                                      clearedbytes); // the last byte of each character
+
+    __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
+    __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+    __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+                                                            beforeasciibytes); // the second last bytes (of two, three byte seq,
+                                                                               // surrogates)
+    secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
+
+    __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+                                                     indexofsecondlastbytes); // indices of the third last bytes
+    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+                                                  clearedbytes); // only those that are the third last byte of a sequence
+    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+                                                           thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                                                                           // surrogate)
+    thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
+    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
+    __mmask32 Mlo = __mmask32(Mlo_uint64);
+    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
+    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
+                                                  mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
+    __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
+                                                                 4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
+    __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
+                                                   lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
+    __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
+                                         mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
+    // the elements of Wout excluding the last element if it happens to be a high surrogate:
+    __mmask32 Mout = ~(Mhi & 0x80000000);
+    __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+
+    // mismatched continuation bytes:
     if (tail == SIMDUTF_FULL) {
-        // In the two-byte/ASCII scenario, we are easily latency bound, so we want
-        // to increment the input buffer as quickly as possible.
-        // We process 32 bytes unless the byte at index 32 is a continuation byte,
-        // in which case we include it as well for a total of 33 bytes.
-        // Note that if x is an ASCII byte, then the following is false:
-        // int8_t(x) <= int8_t(0xc0) under two's complement.
-        in += 32;
-        if (int8_t(*in) <= int8_t(0xc0))
-            in++;
-        // The alternative is to do
-        // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-        // but it requires loading the input, doing the mask computation, and converting
-        // back the mask to a general register. It just takes too long, leaving the
-        // processor likely to be idle.
+      __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+      // the presence of a 1 bit indicates that they overlap.
+      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
     } else {
-        in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-    }
-    __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data
-    lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words
-    __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
-    follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words
-    lead = _mm512_slli_epi16(lead, 6); // shifted into position
-    __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
+      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+      if (mc != bxorm1234) { return false; }
+    }
+    // Encodings out of range...
+    {
+      // the location of 3-byte sequence start bytes in the input
+      __mmask64 m3 = m34 & (b ^ m4);
+      // words in Wout corresponding to 3-byte sequences.
+      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+      __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+      __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
+      __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
+      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
+    }
+    in += 64 - _lzcnt_u64(mprocessed);
+    int64_t nout = _mm_popcnt_u64(mprocessed);
+    if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+    out += nout;
+    return true; // ok
+  }
+  // Fast path 2: all ASCII or 2 byte
+  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
+  // on top of -0xc0 we subtract -2 which we get back later from the
+  // continuation byte tags
+  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
+  __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
+  if (tail == SIMDUTF_FULL) {
+    __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
+    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
+  } else {
+    __mmask64 bxorleading = _kxor_mask64(b, leading);
+    if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
+  }
+  //
+  if (tail == SIMDUTF_FULL) {
+    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
+    // to increment the input buffer as quickly as possible.
+    // We process 32 bytes unless the byte at index 32 is a continuation byte,
+    // in which case we include it as well for a total of 33 bytes.
+    // Note that if x is an ASCII byte, then the following is false:
+    // int8_t(x) <= int8_t(0xc0) under two's complement.
+    in += 32;
+    if(int8_t(*in) <= int8_t(0xc0)) in++;
+    // The alternative is to do
+    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+    // but it requires loading the input, doing the mask computation, and converting
+    // back the mask to a general register. It just takes too long, leaving the
+    // processor likely to be idle.
+  } else {
+    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+  }
+  __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte);          // will contain zero for ascii, and the data
+  lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead));                 // ... zero extended into words
+  __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
+  follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow));             // ... zero extended into words
+  lead = _mm512_slli_epi16(lead, 6);                                         // shifted into position
+  __m512i final = _mm512_add_epi16(follow, lead);                            // combining lead and follow
+
+  if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
+  if (tail == SIMDUTF_FULL) {
+    // Next part is UTF-16 specific and can be generalized to UTF-32.
+    int nout = _mm_popcnt_u32(uint32_t(leading));
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+  } else {
+    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+  }
+
+  return true; // we are fine.
+}
+
 
-    if (big_endian) {
-        final = _mm512_shuffle_epi8(final, byteflip);
-    }
-    if (tail == SIMDUTF_FULL) {
-        // Next part is UTF-16 specific and can be generalized to UTF-32.
-        int nout = _mm_popcnt_u32(uint32_t(leading));
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-    } else {
-        int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-    }
 
-    return true; // we are fine.
-}
 
 /*
     utf32_to_utf16_masked converts `count` lower UTF-32 words
@@ -18780,9 +17453,8 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char*& in, char16_t
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
 
     const __mmask16 valid = uint16_t((1 << count) - 1);
     // 1. check if we have any surrogate pairs
@@ -18790,11 +17462,11 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
-        if (big_endian) {
-            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
+        if(big_endian) {
+          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
 
         } else {
-            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
+          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -18824,14 +17496,12 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
         // Here we want to trim all of the upper 16-bit words from the 2-byte
         // characters represented as 4-byte values. We can compute it from
         // sp_mask or the following... It can be more optimized!
-        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
-        if (big_endian) {
-            t5 = _mm512_shuffle_epi8(t5, byteflip);
-        }
+        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        const  __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
+        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
-        _mm512_mask_storeu_epi16(output, (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
         //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
     }
 
@@ -18858,19 +17528,18 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
     // check if we have any surrogate pairs
     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
     const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
         // technically, it should be _mm256_storeu_epi16
-        if (big_endian) {
-            _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
+        if(big_endian) {
+          _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
         } else {
-            _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
+          _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -18897,13 +17566,11 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
         __m512i t5 = _mm512_ror_epi32(t4, 16);
-        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        if (big_endian) {
-            t5 = _mm512_shuffle_epi8(t5, byteflip);
-        }
+        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
-        _mm512_mask_storeu_epi16(output, (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
         //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
     }
 
@@ -18913,23 +17580,21 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
 /**
  * Store the last N bytes of previous followed by 512-N bytes from input.
  */
-template<int N>
-__m512i prev(__m512i input, __m512i previous)
-{
-    static_assert(N <= 32, "N must be no larger than 32");
-    const __m512i movemask = _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+template <int N>
+__m512i prev(__m512i input, __m512i previous) {
+    static_assert(N<=32, "N must be no larger than 32");
+    const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
     const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
 #if SIMDUTF_GCC8 || SIMDUTF_GCC9
-    constexpr int shift = 16 - N; // workaround for GCC8,9
+    constexpr int shift = 16-N; // workaround for GCC8,9
     return _mm512_alignr_epi8(input, rotated, shift);
 #else
-    return _mm512_alignr_epi8(input, rotated, 16 - N);
+    return _mm512_alignr_epi8(input, rotated, 16-N);
 #endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
 }
 
-template<unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
-__m512i shuffle_epi128(__m512i v)
-{
+template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
+__m512i shuffle_epi128(__m512i v) {
     static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
     static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
     static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
@@ -18939,18 +17604,16 @@ __m512i shuffle_epi128(__m512i v)
     return _mm512_shuffle_i32x4(v, v, shuffle);
 }
 
-template<unsigned idx>
-constexpr __m512i broadcast_epi128(__m512i v)
-{
+template <unsigned idx>
+constexpr __m512i broadcast_epi128(__m512i v) {
     return shuffle_epi128<idx, idx, idx, idx>(v);
 }
 
 /**
  * Current unused.
  */
-template<int N>
-__m512i rotate_by_N_epi8(const __m512i input)
-{
+template <int N>
+__m512i rotate_by_N_epi8(const __m512i input) {
 
     // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
     const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
@@ -18966,8 +17629,7 @@ __m512i rotate_by_N_epi8(const __m512i input)
     0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
     corresponding bytes during pshufb.
 */
-simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8)
-{
+simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
     /*
         Input:
         - utf8: bytes stored at separate 32-bit words
@@ -19056,7 +17718,8 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x0707070707070707,
             0x0b0a090900000000,
             0x0707070707070707,
-            0x0b0a090900000000);
+            0x0b0a090900000000
+        );
 
         const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
         values = _mm512_sllv_epi32(values, shift);
@@ -19077,7 +17740,8 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x1919191919191919,
             0x0b10151500000000,
             0x1919191919191919,
-            0x0b10151500000000);
+            0x0b10151500000000
+        );
 
         const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
         values = _mm512_srlv_epi32(values, shift);
@@ -19086,29 +17750,29 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
     return values;
 }
 
-simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int& count)
-{
+
+simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
     const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
     const __m512i expand_ver2 = _mm512_setr_epi64(
-        0x0403020103020100,
-        0x0605040305040302,
-        0x0807060507060504,
-        0x0a09080709080706,
-        0x0c0b0a090b0a0908,
-        0x0e0d0c0b0d0c0b0a,
-        0x000f0e0d0f0e0d0c,
-        0x0201000f01000f0e);
+                0x0403020103020100,
+                0x0605040305040302,
+                0x0807060507060504,
+                0x0a09080709080706,
+                0x0c0b0a090b0a0908,
+                0x0e0d0c0b0d0c0b0a,
+                0x000f0e0d0f0e0d0c,
+                0x0201000f01000f0e
+    );
     const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
     const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
     const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
     const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
     const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
     count = static_cast<int>(count_ones(leading_bytes));
-    return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
+    return  _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
 }
 
-simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input)
-{
+simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
     __m512i char_class = _mm512_srli_epi32(input, 4);
     /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
     const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
@@ -19161,95 +17825,97 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input)
         ]
 */
 
-#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                                     \
-    {                                                                                                                         \
-        const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                                                   \
-        const __m512i expand_ver2 = _mm512_setr_epi64(                                                                        \
-            0x0403020103020100,                                                                                               \
-            0x0605040305040302,                                                                                               \
-            0x0807060507060504,                                                                                               \
-            0x0a09080709080706,                                                                                               \
-            0x0c0b0a090b0a0908,                                                                                               \
-            0x0e0d0c0b0d0c0b0a,                                                                                               \
-            0x000f0e0d0f0e0d0c,                                                                                               \
-            0x0201000f01000f0e);                                                                                              \
-        const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                                       \
-                                                                                                                              \
-        __mmask16 leading_bytes;                                                                                              \
-        const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                                                  \
-        const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                                              \
-        const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                                                  \
-        leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                                            \
-                                                                                                                              \
-        __m512i char_class;                                                                                                   \
-        char_class = _mm512_srli_epi32(input, 4);                                                                             \
-        /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                                                \
-        const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                                                  \
-        const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                                            \
-        char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);                                   \
-                                                                                                                              \
-        const int valid_count = static_cast<int>(count_ones(leading_bytes));                                                  \
-        const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                                      \
-                                                                                                                              \
-        const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);                         \
-                                                                                                                              \
-        if (UTF32) {                                                                                                          \
-            if (MASKED) {                                                                                                     \
-                const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                                     \
-                _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                                       \
-            } else {                                                                                                          \
-                _mm512_storeu_si512((__m512i*)output, out);                                                                   \
-            }                                                                                                                 \
-            output += valid_count;                                                                                            \
-        } else {                                                                                                              \
-            if (MASKED) {                                                                                                     \
-                output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output)); \
-            } else {                                                                                                          \
-                output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output));        \
-            }                                                                                                                 \
-        }                                                                                                                     \
-    }
-
-#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                                        \
-    {                                                                                                                           \
-        if (UTF32) {                                                                                                            \
-            if (MASKED) {                                                                                                       \
-                const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                                  \
-                _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                                  \
-            } else {                                                                                                            \
-                _mm512_storeu_si512((__m512i*)output, INPUT);                                                                   \
-            }                                                                                                                   \
-            output += VALID_COUNT;                                                                                              \
-        } else {                                                                                                                \
-            if (MASKED) {                                                                                                       \
-                output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output)); \
-            } else {                                                                                                            \
-                output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output));        \
-            }                                                                                                                   \
-        }                                                                                                                       \
-    }
-
-#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                                               \
-    if (UTF32) {                                                                                                       \
-        const __m128i t0 = _mm512_castsi512_si128(utf8);                                                               \
-        const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                                                         \
-        const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                                                         \
-        const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                                                         \
-        _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi32(t0));                                    \
-        _mm512_storeu_si512((__m512i*)(output + 1 * 16), _mm512_cvtepu8_epi32(t1));                                    \
-        _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi32(t2));                                    \
-        _mm512_storeu_si512((__m512i*)(output + 3 * 16), _mm512_cvtepu8_epi32(t3));                                    \
-    } else {                                                                                                           \
-        const __m256i h0 = _mm512_castsi512_si256(utf8);                                                               \
-        const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                                                         \
-        if (big_endian) {                                                                                              \
-            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
-            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
-        } else {                                                                                                       \
-            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi16(h0));                                \
-            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi16(h1));                                \
-        }                                                                                                              \
-    }
+#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                    \
+        {                                                                                                    \
+            const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                              \
+            const __m512i expand_ver2 = _mm512_setr_epi64(                                                   \
+                0x0403020103020100,                                                                          \
+                0x0605040305040302,                                                                          \
+                0x0807060507060504,                                                                          \
+                0x0a09080709080706,                                                                          \
+                0x0c0b0a090b0a0908,                                                                          \
+                0x0e0d0c0b0d0c0b0a,                                                                          \
+                0x000f0e0d0f0e0d0c,                                                                          \
+                0x0201000f01000f0e                                                                           \
+            );                                                                                               \
+            const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                  \
+                                                                                                             \
+            __mmask16 leading_bytes;                                                                         \
+            const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                             \
+            const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                         \
+            const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                             \
+            leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                       \
+                                                                                                             \
+            __m512i char_class;                                                                              \
+            char_class = _mm512_srli_epi32(input, 4);                                                        \
+            /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                           \
+            const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                             \
+            const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                       \
+            char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);              \
+                                                                                                             \
+            const int valid_count = static_cast<int>(count_ones(leading_bytes));                             \
+            const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                 \
+                                                                                                             \
+            const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);    \
+                                                                                                             \
+            if (UTF32) {                                                                                     \
+                if(MASKED) {                                                                                 \
+                    const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                \
+                    _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                  \
+                } else {                                                                                     \
+                    _mm512_storeu_si512((__m512i*)output, out);                                              \
+                }                                                                                            \
+                output += valid_count;                                                                       \
+            } else {                                                                                         \
+                if(MASKED) {                                                                                 \
+                    output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
+                } else {                                                                                     \
+                    output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output));        \
+                }                                                                                            \
+            }                                                                                                \
+        }
+
+#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                    \
+{                                                                                                           \
+    if (UTF32) {                                                                                            \
+        if(MASKED) {                                                                                        \
+            const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                  \
+            _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                  \
+        } else {                                                                                            \
+            _mm512_storeu_si512((__m512i*)output, INPUT);                                              \
+        }                                                                                                   \
+        output += VALID_COUNT;                                                                              \
+    } else {                                                                                                \
+        if(MASKED) {                                                                                        \
+            output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));      \
+        } else {                                                                                            \
+            output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));             \
+        }                                                                                                   \
+    }                                                                                                       \
+}
+
+
+#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                  \
+        if (UTF32) {                                                                      \
+                const __m128i t0 = _mm512_castsi512_si128(utf8);                          \
+                const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                    \
+                const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                    \
+                const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                    \
+                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi32(t0)); \
+                _mm512_storeu_si512((__m512i*)(output + 1*16), _mm512_cvtepu8_epi32(t1)); \
+                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi32(t2)); \
+                _mm512_storeu_si512((__m512i*)(output + 3*16), _mm512_cvtepu8_epi32(t3)); \
+        } else {                                                                          \
+                const __m256i h0 = _mm512_castsi512_si256(utf8);                          \
+                const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                    \
+                if(big_endian) {                                                          \
+                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
+                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
+                } else {                                                                  \
+                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi16(h0)); \
+                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi16(h1)); \
+                }                                                                         \
+        }
 /* end file src/icelake/icelake_macros.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
@@ -19272,23 +17938,23 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input)
     - pair.first    - the first unprocessed input byte
     - pair.second   - the first unprocessed output word
 */
-template<endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
-{
+template <endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
 
     __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
     const char* ptr = str;
     const char* end = ptr + len;
 
@@ -19303,7 +17969,7 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if (ascii == 0) {
+        if(ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -19317,8 +17983,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if (valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        if(valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -19336,8 +18002,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if (valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+        if(valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -19347,14 +18013,14 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4 * 16;
+        ptr += 4*16;
     }
 
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if (ascii == 0) {
+        if(ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -19366,8 +18032,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if (valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+            if(valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -19381,21 +18047,22 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3 * 16;
+            ptr += 3*16;
         }
     }
-    return { ptr, output };
+    return {ptr, output};
 }
 
+
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
 /* begin file src/icelake/icelake_utf8_validation.inl.cpp */
 // file included directly
 
-simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1)
-{
-    __m512i mask1 = _mm512_setr_epi64(
+
+simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
+  __m512i mask1 = _mm512_setr_epi64(
         0x0202020202020202,
         0x4915012180808080,
         0x0202020202020202,
@@ -19417,7 +18084,7 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0xcbcbdbcbcbcbcbcb,
         0xcbcbcb8b8383a3e7,
         0xcbcbdbcbcbcbcbcb);
-    __m512i index2 = _mm512_and_si512(prev1, v_0f);
+     __m512i index2 = _mm512_and_si512(prev1, v_0f);
 
     __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
     __m512i mask3 = _mm512_setr_epi64(
@@ -19428,19 +18095,19 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0x101010101010101,
         0x1010101babaaee6,
         0x101010101010101,
-        0x1010101babaaee6);
+        0x1010101babaaee6
+    );
     __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
     __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
     return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
-}
+  }
 
-simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
-    const __m512i prev_input, const __m512i sc)
-{
+  simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
+      const __m512i prev_input, const __m512i sc) {
     __m512i prev2 = prev<2>(input, prev_input);
     __m512i prev3 = prev<3>(input, prev_input);
-    __m512i is_third_byte = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
-    __m512i is_fourth_byte = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
+    __m512i is_third_byte  = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0
+    __m512i is_fourth_byte  = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0
     __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte);
     const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
     is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
@@ -19448,14 +18115,13 @@ simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
     const __m512i v_80 = _mm512_set1_epi8(char(0x80));
     return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010);
     //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80);
-    // return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
-}
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the block:
-// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-//
-simdutf_really_inline __m512i is_incomplete(const __m512i input)
-{
+    //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
+  }
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  simdutf_really_inline __m512i is_incomplete(const __m512i input) {
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     __m512i max_value = _mm512_setr_epi64(
@@ -19468,61 +18134,57 @@ simdutf_really_inline __m512i is_incomplete(const __m512i input)
         0xffffffffffffffff,
         0xbfdfefffffffffff);
     return _mm512_subs_epu8(input, max_value);
-}
+  }
 
-struct avx512_utf8_checker {
+  struct avx512_utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
-    __m512i error {};
+    __m512i error{};
 
     // The last input we received
-    __m512i prev_input_block {};
+    __m512i prev_input_block{};
     // Whether the last input we received was incomplete (used for ASCII fast path)
-    __m512i prev_incomplete {};
+    __m512i prev_incomplete{};
 
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        __m512i prev1 = prev<1>(input, prev_input);
-        __m512i sc = check_special_cases(input, prev1);
-        this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
+    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      __m512i prev1 = prev<1>(input, prev_input);
+      __m512i sc = check_special_cases(input, prev1);
+      this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof()
-    {
-        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-        // possibly finish them.
-        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+    simdutf_really_inline void check_eof() {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
     }
 
     // returns true if ASCII.
-    simdutf_really_inline bool check_next_input(const __m512i input)
-    {
-        const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-        const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
-        if (ascii == 0) {
-            this->error = _mm512_or_si512(this->error, this->prev_incomplete);
-            return true;
-        } else {
-            this->check_utf8_bytes(input, this->prev_input_block);
-            this->prev_incomplete = is_incomplete(input);
-            this->prev_input_block = input;
-            return false;
-        }
+    simdutf_really_inline bool check_next_input(const __m512i input) {
+      const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+      const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
+      if(ascii == 0) {
+        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+        return true;
+      } else {
+        this->check_utf8_bytes(input, this->prev_input_block);
+        this->prev_incomplete = is_incomplete(input);
+        this->prev_input_block = input;
+        return false;
+      }
     }
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const
-    {
+    simdutf_really_inline bool errors() const {
         return _mm512_test_epi8_mask(this->error, this->error) != 0;
     }
 
-}; // struct avx512_utf8_checker
+  }; // struct avx512_utf8_checker
 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
@@ -19537,56 +18199,48 @@ struct avx512_utf8_checker {
  * completed. Upon error, the output is set to null.
  */
 
-template<endianness big_endian>
-utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char* in, size_t len, char16_t* out)
-{
-    const char* const final_in = in + len;
-    bool result = true;
-    while (result) {
-        if (in + 64 <= final_in) {
-            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-        } else if (in < final_in) {
-            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-        } else {
-            break;
-        }
-    }
-    if (!result) {
-        out = nullptr;
-    }
-    return std::make_pair(in, out);
+template <endianness big_endian>
+utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
+  const char *const final_in = in + len;
+  bool result = true;
+  while (result) {
+    if (in + 64 <= final_in) {
+        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+    } else if(in < final_in) {
+        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+    } else { break; }
+  }
+  if(!result) { out = nullptr; }
+  return std::make_pair(in, out);
+}
+
+template <endianness big_endian>
+simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
+  const char *const init_in = in;
+  const char16_t *const init_out = out;
+  const char *const final_in = in + len;
+  bool  result = true;
+  while (result) {
+    if (in + 64 <= final_in) {
+        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+    } else if(in < final_in) {
+        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+    } else { break; }
+  }
+  if(!result) {
+    // rewind_and_convert_with_errors will seek a potential error from in onward,
+    // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
+    simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
+    res.count += (in - init_in);
+    return res;
+  } else {
+    return simdutf::result(error_code::SUCCESS,out - init_out);
+  }
 }
 
-template<endianness big_endian>
-simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char* in, size_t len, char16_t* out)
-{
-    const char* const init_in = in;
-    const char16_t* const init_out = out;
-    const char* const final_in = in + len;
-    bool result = true;
-    while (result) {
-        if (in + 64 <= final_in) {
-            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-        } else if (in < final_in) {
-            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-        } else {
-            break;
-        }
-    }
-    if (!result) {
-        // rewind_and_convert_with_errors will seek a potential error from in onward,
-        // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
-        simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
-        res.count += (in - init_in);
-        return res;
-    } else {
-        return simdutf::result(error_code::SUCCESS, out - init_out);
-    }
-}
 
-template<endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
-{
+template <endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -19595,16 +18249,17 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker {};
+    avx512_utf8_checker checker{};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -19613,7 +18268,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if (checker.check_next_input(utf8)) {
+        if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -19626,8 +18281,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if (valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        if(valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -19645,8 +18300,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if (valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+        if(valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -19656,7 +18311,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4 * 16;
+        ptr += 4*16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -19664,7 +18319,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if (checker.check_next_input(utf8)) {
+        if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -19676,8 +18331,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if (valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+            if(valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -19691,25 +18346,24 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3 * 16;
+            ptr += 3*16;
         }
-        validatedptr += 4 * 16;
+        validatedptr += 4*16;
     }
     {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
-        checker.check_next_input(utf8);
+       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
+       checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if (checker.errors()) {
-        return { ptr, nullptr }; // We found an error.
+    if(checker.errors()) {
+        return {ptr, nullptr}; // We found an error.
     }
-    return { ptr, output };
+    return {ptr, output};
 }
 
 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
-template<endianness big_endian, typename OUTPUT>
-std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords)
-{
+template <endianness big_endian, typename OUTPUT>
+std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -19718,16 +18372,17 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker {};
+    avx512_utf8_checker checker{};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -19736,14 +18391,14 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if (checker.check_next_input(utf8)) {
+        if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
             continue;
         }
-        if (checker.errors()) {
-            return { ptr, output, false }; // We found an error.
+        if(checker.errors()) {
+            return {ptr, output, false}; // We found an error.
         }
         const __m512i lane0 = broadcast_epi128<0>(utf8);
         const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -19752,8 +18407,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if (valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        if(valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -19771,8 +18426,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if (valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+        if(valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -19782,7 +18437,7 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4 * 16;
+        ptr += 4*16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -19790,12 +18445,12 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if (checker.check_next_input(utf8)) {
+        if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
-        } else if (checker.errors()) {
-            return { ptr, output, false }; // We found an error.
+        } else if(checker.errors()) {
+            return {ptr, output, false}; // We found an error.
         } else {
             const __m512i lane0 = broadcast_epi128<0>(utf8);
             const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -19804,8 +18459,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if (valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+            if(valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -19819,19 +18474,19 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3 * 16;
+            ptr += 3*16;
         }
-        validatedptr += 4 * 16;
+        validatedptr += 4*16;
     }
     {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
-        checker.check_next_input(utf8);
+       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
+       checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if (checker.errors()) {
-        return { ptr, output, false }; // We found an error.
+    if(checker.errors()) {
+        return {ptr, output, false}; // We found an error.
     }
-    return { ptr, output, true };
+    return {ptr, output, true};
 }
 /* end file src/icelake/icelake_from_utf8.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
@@ -19842,110 +18497,108 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const char16_t* end = buf + len;
-    const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
-    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
-    const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
-    __mmask32 carry { 0 };
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    while (buf + 32 <= end) {
-        // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
-        __m512i in = _mm512_loadu_si512((__m512i*)buf);
-        if (big_endian) {
-            in = _mm512_shuffle_epi8(in, byteflip);
-        }
-
-        // H - bitmask for high surrogates
-        const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
-        // H - bitmask for low surrogates
-        const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
-
-        if ((H | L)) {
-            // surrogate pair(s) in a register
-            const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one.
-                                                          // If valid, V should be equal to 0
-
-            if (V == 0) {
-                // valid case
-                /*
-                    Input surrogate pair:
-                    |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
-                        low surrogate      high surrogate
-                */
-                /*  1. Expand all words to 32-bit words
-                    in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-                */
-                const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-                const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
-
-                /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
-                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-                */
-                const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
-                const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
-
-                /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
-                    |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-                */
-                const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
-                const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
-
-                /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
-                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-                    constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
-                */
-                const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
-                const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
-                const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
-
-                const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), aligned_second, shifted_second);
-                const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H >> 16), added_second, constant);
-
-                //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
-                const __mmask32 valid = ~L & 0x7fffffff;
-                // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
-                // to ease performance portability to Zen 4.
-                const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
-                const size_t howmany1 = count_ones((uint16_t)(valid));
-                _mm512_storeu_si512((__m512i*)utf32_output, compressed_first);
-                utf32_output += howmany1;
-                const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
-                const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
-                // The following could be unsafe in some cases?
-                //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
-                _mm512_mask_storeu_epi32((__m512i*)utf32_output, __mmask16((1 << howmany2) - 1), compressed_second);
-                utf32_output += howmany2;
-                // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
-                buf += 31;
-                carry = (H >> 30) & 0x1;
-            } else {
-                // invalid case
-                return std::make_tuple(buf + carry, utf32_output, false);
-            }
-        } else {
-            // no surrogates
-            // extend all thirty-two 16-bit words to thirty-two 32-bit words
-            _mm512_storeu_si512((__m512i*)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
-            _mm512_storeu_si512((__m512i*)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
-            utf32_output += 32;
-            buf += 32;
-            carry = 0;
-        }
-    } // while
-    return std::make_tuple(buf + carry, utf32_output, true);
+template <endianness big_endian>
+std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
+  const char16_t* end = buf + len;
+  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+  __mmask32 carry{0};
+  const __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  while (buf + 32 <= end) {
+    // Always safe because buf + 32 <= end, so at least 32 char16_t values (64 bytes) are readable from buf:
+    __m512i in = _mm512_loadu_si512((__m512i*)buf);
+    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
+
+    // H - bitmask for high surrogates
+    const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+    // L - bitmask for low surrogates
+    const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
+
+    if ((H|L)) {
+      // surrogate pair(s) in a register
+      const __mmask32 V = (L ^ (carry | (H << 1)));   // A high surrogate must be followed by low one and a low one must be preceded by a high one.
+                                                      // If valid, V should be equal to 0
+
+      if(V == 0) {
+        // valid case
+        /*
+            Input surrogate pair:
+            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+                low surrogate      high surrogate
+        */
+        /*  1. Expand all words to 32-bit words
+            in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+        */
+        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+        const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
+
+        /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
+            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+        */
+        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
+        const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+
+        /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
+            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+        */
+        const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
+        const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
+
+        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
+            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
+        */
+        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
+        const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
+        const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
+
+        const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
+        const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
+
+        //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
+        const __mmask32 valid = ~L & 0x7fffffff;
+        // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
+        // to ease performance portability to Zen 4.
+        const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
+        const size_t howmany1 = count_ones((uint16_t)(valid));
+        _mm512_storeu_si512((__m512i *) utf32_output,  compressed_first);
+        utf32_output += howmany1;
+        const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
+        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
+        // The following could be unsafe in some cases?
+        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
+        _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
+        utf32_output += howmany2;
+        // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
+        buf += 31;
+        carry = (H >> 30) & 0x1;
+      } else {
+        // invalid case
+        return std::make_tuple(buf+carry, utf32_output, false);
+      }
+    } else {
+      // no surrogates
+      // extend all thirty-two 16-bit words to thirty-two 32-bit words
+      _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
+      _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
+      utf32_output += 32;
+      buf += 32;
+      carry = 0;
+    }
+  } // while
+  return std::make_tuple(buf+carry, utf32_output, true);
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
@@ -19953,497 +18606,485 @@ std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const char32_t* end = buf + len;
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-    __m256i running_max = _mm256_setzero_si256();
-    __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
-        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // Must check for overflow in packing
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-        if (saturation_bitmask == 0xffffffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
+  const char32_t* end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  __m256i running_max = _mm256_setzero_si256();
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+      // expected output   : [110a|aaaa|10bb|bbbb] x 16
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+      // 4. pack the bytes
+
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                              0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally, from these two words we build the proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-        } else {
-            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // may require large, non-trivial tables?
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else { // 4-byte
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    // check for invalid input
-    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    return std::make_pair(buf, utf8_output);
+      // 4. expand words 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 4 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be useful.
+      /*if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+        utf8_output += 12;
+        buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // may require large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) { // 2-byte
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {  // 4-byte
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  // check for invalid input
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+    return std::make_pair(nullptr, utf8_output);
+  }
+
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
+
+  return std::make_pair(buf, utf8_output);
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const char32_t* end = buf + len;
-    const char32_t* start = buf;
-
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
-        // Check for too large input
-        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // Must check for overflow in packing
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-        if (saturation_bitmask == 0xffffffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-            // Check for illegal surrogate words
-            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-            }
-
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
+  const char32_t* end = buf + len;
+  const char32_t* start = buf;
+
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+    // Check for too large input
+    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+    }
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+      // 4. pack the bytes
+
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+      // Check for illegal surrogate words
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+      }
+
+      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                              0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-        } else {
-            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // may require large, non-trivial tables?
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else { // 4-byte
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+      // 4. expand words 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be useful.
+      /*if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+        utf8_output += 12;
+        buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // may require large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) { // 2-byte
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {  // 4-byte
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
@@ -20451,177 +19092,160 @@ std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template<endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const char32_t* end = buf + len;
+template <endianness big_endian>
+std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const char32_t* end = buf + len;
 
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-    __m256i forbidden_bytemask = _mm256_setzero_si256();
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
 
-    while (buf + 8 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-        const __m256i v_00000000 = _mm256_setzero_si256();
-        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  while (buf + 8 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-        if (saturation_bitmask == 0xffffffff) {
-            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+      if (big_endian) {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else {
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k;
     }
+  }
 
-    // check for invalid input
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf16_output);
-    }
+  // check for invalid input
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
 
-    return std::make_pair(buf, utf16_output);
+  return std::make_pair(buf, utf16_output);
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template<endianness big_endian>
-std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const char32_t* start = buf;
-    const char32_t* end = buf + len;
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+template <endianness big_endian>
+std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const char32_t* start = buf;
+  const char32_t* end = buf + len;
 
-    while (buf + 8 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-
-        const __m256i v_00000000 = _mm256_setzero_si256();
-        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-        if (saturation_bitmask == 0xffffffff) {
-            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-            }
+  while (buf + 8 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
+      }
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+      if (big_endian) {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else {
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k;
     }
+  }
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
 /* begin file src/icelake/icelake_ascii_validation.inl.cpp */
 // file included directly
 
-bool validate_ascii(const char* buf, size_t len)
-{
-    const char* end = buf + len;
-    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-    __m512i running_or = _mm512_setzero_si512();
-    for (; buf + 64 <= end; buf += 64) {
-        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
-        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-    }
-    if (buf < end) {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end - buf)) - 1, (const __m512i*)buf);
-        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-    }
-    return (_mm512_test_epi8_mask(running_or, running_or) == 0);
+bool validate_ascii(const char* buf, size_t len) {
+  const char* end = buf + len;
+  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
+  __m512i running_or = _mm512_setzero_si512();
+  for (; buf + 64 <= end; buf += 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
+    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+  }
+  if(buf < end) {
+     const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end-buf)) - 1,(const __m512i*)buf);
+    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+  }
+  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
 }
 /* end file src/icelake/icelake_ascii_validation.inl.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
 /* begin file src/icelake/icelake_utf32_validation.inl.cpp */
 // file included directly
 
-const char32_t* validate_utf32(const char32_t* buf, size_t len)
-{
+const char32_t* validate_utf32(const char32_t* buf, size_t len) {
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
 
     const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
@@ -20629,21 +19253,21 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len)
     __m512i currentoffsetmax = _mm512_setzero_si512();
 
     while (buf <= end) {
-        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-        buf += 16;
-        currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
-        currentmax = _mm512_max_epu32(utf32, currentmax);
+      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+      buf += 16;
+      currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
+      currentmax = _mm512_max_epu32(utf32, currentmax);
     }
 
     const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
     const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
     __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-        return nullptr;
+      return nullptr;
     }
     is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-        return nullptr;
+      return nullptr;
     }
 
     return buf;
@@ -20659,191 +19283,199 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len)
  * is written to 'outlen' and the function reports the number of input word
  * consumed.
  */
-template<endianness big_endian>
-size_t utf16_to_utf8_avx512i(const char16_t* inbuf, size_t inlen,
-    unsigned char* outbuf, size_t* outlen)
-{
-    __m512i in;
-    __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
-    __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    const char16_t* const inbuf_orig = inbuf;
-    const unsigned char* const outbuf_orig = outbuf;
-    size_t adjust = 0;
-    int carry = 0;
-
-    while (inlen >= 32) {
-        in = _mm512_loadu_si512(inbuf);
-        if (big_endian) {
-            in = _mm512_shuffle_epi8(in, byteflip);
-        }
-        inlen -= 31;
-    lastiteration:
-        inbuf += 31;
-
-    failiteration:
-        const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
-            inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
-
-        if (_ktestz_mask32_u8(inmask, is234byte)) {
-            // fast path for ASCII only
-            _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
-            outbuf += 31;
-            carry = 0;
-
-            if (inlen < 32) {
-                goto tail;
-            } else {
-                continue;
-            }
-        }
+template <endianness big_endian>
+size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
+                               unsigned char *outbuf, size_t *outlen) {
+  __m512i in;
+  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
+  __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  const char16_t * const inbuf_orig = inbuf;
+  const unsigned char * const outbuf_orig = outbuf;
+  size_t adjust = 0;
+  int carry = 0;
+
+  while (inlen >= 32) {
+    in = _mm512_loadu_si512(inbuf);
+    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
+    inlen -= 31;
+  lastiteration:
+    inbuf += 31;
+
+  failiteration:
+    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
+      inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
+
+    if (_ktestz_mask32_u8(inmask, is234byte)) {
+      // fast path for ASCII only
+      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
+      outbuf += 31;
+      carry = 0;
+
+      if (inlen < 32) {
+        goto tail;
+      } else {
+        continue;
+      }
+    }
+
+    const __mmask32 is12byte =
+        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
+
+    if (_ktestc_mask32_u8(is12byte, inmask)) {
+      // fast path for 1 and 2 byte only
+
+      const __m512i twobytes = _mm512_ternarylogic_epi32(
+          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
+          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
+      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
+                                 _mm512_set1_epi16(int16_t(0x80c0)));
+      const __m512i cmpmask =
+          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
+                                  _mm512_set1_epi16(0x0800));
+      const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
+      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
+      _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
+                              out);
+      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
+      carry = 0;
+
+      if (inlen < 32) {
+        goto tail;
+      } else {
+        continue;
+      }
+    }
+    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+
+    __m512i taglo = _mm512_set1_epi32(0x8080e000);
+    __m512i taghi = taglo;
+
+    const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
+    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
+        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
+    const __mmask32 losurr = _mm512_cmp_epu16_mask(
+        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+
+    int carryout = 0;
+    if (!_kortestz_mask32_u8(hisurr, losurr)) {
+      // handle surrogates
+
+      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
+      __m512i his = _mm512_alignr_epi32(lo, hi, 1);
+
+      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
+      taglo =
+          _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
+      taghi =
+          _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
+
+      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
+      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
+      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
+      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
+      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
+      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
+
+      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
+
+      const uint32_t  h = _cvtmask32_u32(hisurr);
+      const uint32_t  l = _cvtmask32_u32(losurr);
+      // check for mismatched surrogates
+      if ((h + h + carry) ^ l) {
+        const uint32_t lonohi = l & ~(h + h + carry);
+        const uint32_t hinolo = h & ~(l >> 1);
+        inlen = _tzcnt_u32(hinolo | lonohi);
+        inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
+        in = _mm512_maskz_mov_epi16(inmask, in);
+        adjust = (int)inlen - 31;
+        inlen = 0;
+        goto failiteration;
+      }
+    }
 
-        const __mmask32 is12byte = _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
-
-        if (_ktestc_mask32_u8(is12byte, inmask)) {
-            // fast path for 1 and 2 byte only
-
-            const __m512i twobytes = _mm512_ternarylogic_epi32(
-                _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
-                _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
-            in = _mm512_mask_add_epi16(in, is234byte, twobytes,
-                _mm512_set1_epi16(int16_t(0x80c0)));
-            const __m512i cmpmask = _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
-                _mm512_set1_epi16(0x0800));
-            const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
-            const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
-            _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
-                out);
-            outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
-            carry = 0;
-
-            if (inlen < 32) {
-                goto tail;
-            } else {
-                continue;
-            }
-        }
-        __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-        __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
-
-        __m512i taglo = _mm512_set1_epi32(0x8080e000);
-        __m512i taghi = taglo;
-
-        const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
-        const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
-            inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
-        const __mmask32 losurr = _mm512_cmp_epu16_mask(
-            fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
-
-        int carryout = 0;
-        if (!_kortestz_mask32_u8(hisurr, losurr)) {
-            // handle surrogates
-
-            __m512i los = _mm512_alignr_epi32(hi, lo, 1);
-            __m512i his = _mm512_alignr_epi32(lo, hi, 1);
-
-            const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
-            taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
-            taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
-
-            lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
-            hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
-            los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
-            his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
-            lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
-            hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
-
-            carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
-
-            const uint32_t h = _cvtmask32_u32(hisurr);
-            const uint32_t l = _cvtmask32_u32(losurr);
-            // check for mismatched surrogates
-            if ((h + h + carry) ^ l) {
-                const uint32_t lonohi = l & ~(h + h + carry);
-                const uint32_t hinolo = h & ~(l >> 1);
-                inlen = _tzcnt_u32(hinolo | lonohi);
-                inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
-                in = _mm512_maskz_mov_epi16(inmask, in);
-                adjust = (int)inlen - 31;
-                inlen = 0;
-                goto failiteration;
-            }
-        }
+    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
+    carry = carryout;
 
-        hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
-        carry = carryout;
+    __m512i mslo =
+        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
 
-        __m512i mslo = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
+    __m512i mshi =
+        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
 
-        __m512i mshi = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
+    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
+    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
 
-        const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
-        const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
+    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
+    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
+    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
 
-        const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
-        const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
-        const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
+    taglo =
+        _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
+    taghi =
+        _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
+    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
+    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
 
-        taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
-        taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
-        __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-            _mm512_set1_epi32(0x00010101));
-        __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-            _mm512_set1_epi32(0x00010101));
 
-        magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-            _mm512_set1_epi32(0x00010101));
-        magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-            _mm512_set1_epi32(0x00010101));
+    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
+    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
 
-        mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
-            0xea); // A&B|C
-        mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
-            0xea);
-        mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
+    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
+                                     0xea); // A&B|C
+    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
+                                     0xea);
+    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
 
-        mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
+    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
 
-        const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
-        const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
-        const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
-        const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
-        const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
-        const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
+    const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
+    const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
+    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
+    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
+    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
+    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
 
-        uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
-        uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
+    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
 
-        _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
-        _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
-        outbuf += advlo + advhi;
-    }
-    outbuf -= adjust;
+    _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
+    _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
+    outbuf += advlo + advhi;
+  }
+  outbuf -= adjust;
 
 tail:
-    if (inlen != 0) {
-        // We must have inlen < 31.
-        inmask = _cvtu32_mask32((1 << inlen) - 1);
-        in = _mm512_maskz_loadu_epi16(inmask, inbuf);
-        if (big_endian) {
-            in = _mm512_shuffle_epi8(in, byteflip);
-        }
-        adjust = inlen - 31;
-        inlen = 0;
-        goto lastiteration;
-    }
-    *outlen = (outbuf - outbuf_orig) + adjust;
-    return ((inbuf - inbuf_orig) + adjust);
+  if (inlen != 0) {
+    // We must have inlen < 31.
+    inmask = _cvtu32_mask32((1 << inlen) - 1);
+    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
+    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
+    adjust = inlen - 31;
+    inlen = 0;
+    goto lastiteration;
+  }
+  *outlen = (outbuf - outbuf_orig) + adjust;
+  return ((inbuf - inbuf_orig) + adjust);
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
 
+#include <cstdint>
+
 } // namespace
 } // namespace icelake
 } // namespace simdutf
@@ -20852,137 +19484,139 @@ namespace simdutf {
 namespace icelake {
 
 simdutf_warn_unused int
-implementation::detect_encodings(const char* input,
-    size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    if (length % 2 == 0) {
-        const char* buf = input;
-
-        const char* start = buf;
-        const char* end = input + length;
-
-        bool is_utf8 = true;
-        bool is_utf16 = true;
-        bool is_utf32 = true;
-
-        int out = 0;
-
-        avx512_utf8_checker checker {};
-        __m512i currentmax = _mm512_setzero_si512();
-        while (buf + 64 <= end) {
-            __m512i in = _mm512_loadu_si512((__m512i*)buf);
-            __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-            __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-            if (surrogates) {
-                is_utf8 = false;
-
-                // Can still be either UTF-16LE or UTF-32 depending on the positions
-                // of the surrogates To be valid UTF-32, a surrogate cannot be in the
-                // two most significant bytes of any 32-bit word. On the other hand, to
-                // be valid UTF-16LE, at least one surrogate must be in the two most
-                // significant bytes of a 32-bit word since they always come in pairs in
-                // UTF-16LE. Note that we always proceed in multiple of 4 before this
-                // point so there is no offset in 32-bit words.
-
-                if ((surrogates & 0xaaaaaaaa) != 0) {
-                    is_utf32 = false;
-                    __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
-                        diff, _mm512_set1_epi16(uint16_t(0x0400)));
-                    __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-                    // high must be followed by low
-                    if ((highsurrogates << 1) != lowsurrogates) {
-                        return simdutf::encoding_type::unspecified;
-                    }
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  if (length % 2 == 0) {
+    const char *buf = input;
 
-                    bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-                    if (ends_with_high) {
-                        buf += 31 * sizeof(char16_t); // advance only by 31 words so that we start
-                                                      // with the high surrogate on the next round.
-                    } else {
-                        buf += 32 * sizeof(char16_t);
-                    }
-                    is_utf16 = validate_utf16le(reinterpret_cast<const char16_t*>(buf),
-                        (end - buf) / sizeof(char16_t));
-                    if (!is_utf16) {
-                        return simdutf::encoding_type::unspecified;
+    const char *start = buf;
+    const char *end = input + length;
 
-                    } else {
-                        return simdutf::encoding_type::UTF16_LE;
-                    }
+    bool is_utf8 = true;
+    bool is_utf16 = true;
+    bool is_utf32 = true;
 
-                } else {
-                    is_utf16 = false;
-                    // Check for UTF-32
-                    if (length % 4 == 0) {
-                        const char32_t* input32 = reinterpret_cast<const char32_t*>(buf);
-                        const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + length / 4;
-                        if (validate_utf32(input32, end32 - input32)) {
-                            return simdutf::encoding_type::UTF32_LE;
-                        }
-                    }
-                    return simdutf::encoding_type::unspecified;
-                }
-                break;
-            }
-            // If no surrogate, validate under other encodings as well
+    int out = 0;
 
-            // UTF-32 validation
-            currentmax = _mm512_max_epu32(in, currentmax);
+    avx512_utf8_checker checker{};
+    __m512i currentmax = _mm512_setzero_si512();
+    while (buf + 64 <= end) {
+      __m512i in = _mm512_loadu_si512((__m512i *)buf);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates =
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if (surrogates) {
+        is_utf8 = false;
+
+        // Can still be either UTF-16LE or UTF-32 depending on the positions
+        // of the surrogates To be valid UTF-32, a surrogate cannot be in the
+        // two most significant bytes of any 32-bit word. On the other hand, to
+        // be valid UTF-16LE, at least one surrogate must be in the two most
+        // significant bytes of a 32-bit word since they always come in pairs in
+        // UTF-16LE. Note that we always proceed in multiple of 4 before this
+        // point so there is no offset in 32-bit words.
+
+        if ((surrogates & 0xaaaaaaaa) != 0) {
+          is_utf32 = false;
+          __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
+              diff, _mm512_set1_epi16(uint16_t(0x0400)));
+          __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+          // high must be followed by low
+          if ((highsurrogates << 1) != lowsurrogates) {
+            return simdutf::encoding_type::unspecified;
+          }
+
+          bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+          if (ends_with_high) {
+            buf +=
+                31 *
+                sizeof(char16_t); // advance only by 31 words so that we start
+                                  // with the high surrogate on the next round.
+          } else {
+            buf += 32 * sizeof(char16_t);
+          }
+          is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
+                                      (end - buf) / sizeof(char16_t));
+          if (!is_utf16) {
+            return simdutf::encoding_type::unspecified;
 
-            // UTF-8 validation
-            checker.check_next_input(in);
+          } else {
+            return simdutf::encoding_type::UTF16_LE;
+          }
 
-            buf += 64;
+        } else {
+          is_utf16 = false;
+          // Check for UTF-32
+          if (length % 4 == 0) {
+            const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
+            const char32_t *end32 =
+                reinterpret_cast<const char32_t *>(start) + length / 4;
+            if (validate_utf32(input32, end32 - input32)) {
+              return simdutf::encoding_type::UTF32_LE;
+            }
+          }
+          return simdutf::encoding_type::unspecified;
         }
+        break;
+      }
+      // If no surrogate, validate under other encodings as well
 
-        // Check which encodings are possible
+      // UTF-32 validation
+      currentmax = _mm512_max_epu32(in, currentmax);
 
-        if (is_utf8) {
-            size_t current_length = static_cast<size_t>(buf - start);
-            if (current_length != length) {
-                const __m512i utf8 = _mm512_maskz_loadu_epi8(
-                    (1ULL << (length - current_length)) - 1, (const __m512i*)buf);
-                checker.check_next_input(utf8);
-            }
-            checker.check_eof();
-            if (!checker.errors()) {
-                out |= simdutf::encoding_type::UTF8;
-            }
-        }
+      // UTF-8 validation
+      checker.check_next_input(in);
 
-        if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (length - (buf - start)) / 2)) {
-            out |= simdutf::encoding_type::UTF16_LE;
-        }
+      buf += 64;
+    }
 
-        if (is_utf32 && (length % 4 == 0)) {
-            currentmax = _mm512_max_epu32(
-                _mm512_maskz_loadu_epi8(
-                    (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
-                    (const __m512i*)buf),
-                currentmax);
-            __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
-                _MM_CMPINT_GT);
-            if (outside_range == 0) {
-                out |= simdutf::encoding_type::UTF32_LE;
-            }
-        }
+    // Check which encodings are possible
 
-        return out;
-    } else if (implementation::validate_utf8(input, length)) {
-        return simdutf::encoding_type::UTF8;
-    } else {
-        return simdutf::encoding_type::unspecified;
+    if (is_utf8) {
+      size_t current_length = static_cast<size_t>(buf - start);
+      if (current_length != length) {
+        const __m512i utf8 = _mm512_maskz_loadu_epi8(
+            (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
+        checker.check_next_input(utf8);
+      }
+      checker.check_eof();
+      if (!checker.errors()) {
+        out |= simdutf::encoding_type::UTF8;
+      }
+    }
+
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
+                        reinterpret_cast<const char16_t *>(buf),
+                        (length - (buf - start)) / 2)) {
+      out |= simdutf::encoding_type::UTF16_LE;
     }
+
+    if (is_utf32 && (length % 4 == 0)) {
+      currentmax = _mm512_max_epu32(
+          _mm512_maskz_loadu_epi8(
+              (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
+              (const __m512i *)buf),
+          currentmax);
+      __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
+                                _MM_CMPINT_GT);
+      if (outside_range == 0) {
+        out |= simdutf::encoding_type::UTF32_LE;
+      }
+    }
+
+    return out;
+  } else if (implementation::validate_utf8(input, length)) {
+    return simdutf::encoding_type::UTF8;
+  } else {
+    return simdutf::encoding_type::unspecified;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    avx512_utf8_checker checker {};
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+    avx512_utf8_checker checker{};
     const char* ptr = buf;
     const char* end = ptr + len;
     for (; ptr + 64 <= end; ptr += 64) {
@@ -20990,1157 +19624,1150 @@ simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t l
         checker.check_next_input(utf8);
     }
     {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr);
-        checker.check_next_input(utf8);
+       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
+       checker.check_next_input(utf8);
     }
     checker.check_eof();
-    return !checker.errors();
+    return ! checker.errors();
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    avx512_utf8_checker checker {};
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+    avx512_utf8_checker checker{};
     const char* ptr = buf;
     const char* end = ptr + len;
-    size_t count { 0 };
+    size_t count{0};
     for (; ptr + 64 <= end; ptr += 64) {
-        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        checker.check_next_input(utf8);
-        if (checker.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-            res.count += count;
-            return res;
-        }
-        count += 64;
-    }
-    {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr);
-        checker.check_next_input(utf8);
-        if (checker.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-            res.count += count;
-            return res;
-        } else {
-            return result(error_code::SUCCESS, len);
-        }
-    }
-}
-
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return icelake::validate_ascii(buf, len);
-}
-
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    const char* buf_orig = buf;
-    const char* end = buf + len;
-    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-    for (; buf + 64 <= end; buf += 64) {
-        const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
-        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-        if (notascii) {
-            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-        }
+      const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
+      checker.check_next_input(utf8);
+      if(checker.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+        res.count += count;
+        return res;
+      }
+      count += 64;
     }
     {
-        const __m512i input = _mm512_maskz_loadu_epi8((1ULL << (end - buf)) - 1, (const __m512i*)buf);
-        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-        if (notascii) {
-            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-        }
-    }
-    return result(error_code::SUCCESS, len);
-}
-
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* end = buf + len;
-
-    for (; buf + 32 <= end;) {
-        __m512i in = _mm512_loadu_si512((__m512i*)buf);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                return false;
-            }
-            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-            if (ends_with_high) {
-                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
-            } else {
-                buf += 32;
-            }
+      const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
+      checker.check_next_input(utf8);
+      if(checker.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+        res.count += count;
+        return res;
+      } else {
+        return result(error_code::SUCCESS, len);
+      }
+    }
+}
+
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return icelake::validate_ascii(buf, len);
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  const char* buf_orig = buf;
+  const char* end = buf + len;
+  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
+  for (; buf + 64 <= end; buf += 64) {
+    const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
+    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+    if(notascii) {
+      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
+    }
+  }
+  {
+    const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
+    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+    if(notascii) {
+      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
+    }
+  }
+  return result(error_code::SUCCESS, len);
+}
+
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+    const char16_t *end = buf + len;
+
+    for(;buf + 32 <= end; ) {
+      __m512i in = _mm512_loadu_si512((__m512i*)buf);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+           return false;
+        }
+        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+        if(ends_with_high) {
+          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
         } else {
-            buf += 32;
+          buf += 32;
         }
+      } else {
+        buf += 32;
+      }
     }
-    if (buf < end) {
-        __m512i in = _mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                return false;
-            }
+    if(buf < end) {
+      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+           return false;
         }
+      }
     }
     return true;
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* end = buf + len;
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    for (; buf + 32 <= end;) {
-        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                return false;
-            }
-            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-            if (ends_with_high) {
-                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
-            } else {
-                buf += 32;
-            }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+   const char16_t *end = buf + len;
+   const __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+    for(;buf + 32 <= end; ) {
+      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+           return false;
+        }
+        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+        if(ends_with_high) {
+          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
         } else {
-            buf += 32;
+          buf += 32;
         }
+      } else {
+        buf += 32;
+      }
     }
-    if (buf < end) {
-        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf), byteflip);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                return false;
-            }
+    if(buf < end) {
+      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+           return false;
         }
+      }
     }
     return true;
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* start_buf = buf;
-    const char16_t* end = buf + len;
-    for (; buf + 32 <= end;) {
-        __m512i in = _mm512_loadu_si512((__m512i*)buf);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-            }
-            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-            if (ends_with_high) {
-                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
-            } else {
-                buf += 32;
-            }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+    const char16_t *start_buf = buf;
+    const char16_t *end = buf + len;
+    for(;buf + 32 <= end; ) {
+      __m512i in = _mm512_loadu_si512((__m512i*)buf);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
+          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
+          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+        }
+        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+        if(ends_with_high) {
+          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
         } else {
-            buf += 32;
-        }
-    }
-    if (buf < end) {
-        __m512i in = _mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-            }
-        }
+          buf += 32;
+        }
+      } else {
+        buf += 32;
+      }
+    }
+    if(buf < end) {
+      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
+          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
+          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+        }
+      }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* start_buf = buf;
-    const char16_t* end = buf + len;
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+    const char16_t *start_buf = buf;
+    const char16_t *end = buf + len;
     const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    for (; buf + 32 <= end;) {
-        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-            }
-            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-            if (ends_with_high) {
-                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
-            } else {
-                buf += 32;
-            }
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+    for(;buf + 32 <= end; ) {
+      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
+          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
+          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+        }
+        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+        if(ends_with_high) {
+          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
         } else {
-            buf += 32;
-        }
-    }
-    if (buf < end) {
-        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf), byteflip);
-        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-        if (surrogates) {
-            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-            // high must be followed by low
-            if ((highsurrogates << 1) != lowsurrogates) {
-                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-            }
-        }
+          buf += 32;
+        }
+      } else {
+        buf += 32;
+      }
+    }
+    if(buf < end) {
+      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
+      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+      if(surrogates) {
+        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+        // high must be followed by low
+        if ((highsurrogates << 1) != lowsurrogates) {
+          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
+          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
+          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+        }
+      }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
-{
-    const char32_t* tail = icelake::validate_utf32(buf, len);
-    if (tail) {
-        return scalar::utf32::validate(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t * tail = icelake::validate_utf32(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
 
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
     const char32_t* buf_orig = buf;
     while (buf <= end) {
-        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-            _MM_CMPINT_GT);
-        if (outside_range) {
-            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-        }
-
-        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-            _MM_CMPINT_GT);
-        if (surrogate_range) {
-            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-        }
-        buf += 16;
-    }
-    if (buf < buf_orig + len) {
-        __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1 << (buf_orig + len - buf)) - 1), (const __m512i*)buf);
-        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-            _MM_CMPINT_GT);
-        if (outside_range) {
-            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-        }
-        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-            _MM_CMPINT_GT);
-        if (surrogate_range) {
-            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-        }
+      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+                                _MM_CMPINT_GT);
+      if (outside_range) {
+        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
+      }
+
+      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+                                _MM_CMPINT_GT);
+      if (surrogate_range) {
+        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+      }
+      buf += 16;
+    }
+    if(buf < buf_orig + len) {
+      __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf);
+      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+                                _MM_CMPINT_GT);
+      if (outside_range) {
+        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
+      }
+      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+                                _MM_CMPINT_GT);
+      if (surrogate_range) {
+        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+      }
     }
 
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
-{
-    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.second == nullptr) {
-        return 0;
-    }
-    return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
-    if (ret.second == nullptr) {
-        return 0;
-    }
-    return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-8 -> UTF-16BE, returning a `result`
+   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output); // pure forwarder: the kernel's result is returned unchanged
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
-    size_t saved_bytes = ret.second - utf16_output;
-    const char* end = buf + len;
-    if (ret.first == end) {
-        return saved_bytes;
-    }
-
-    // Note: AVX512 procedure looks up 4 bytes forward, and
-    //       correctly converts multi-byte chars even if their
-    //       continuation bytes lie outsiede 16-byte window.
-    //       It meas, we have to skip continuation bytes from
-    //       the beginning ret.first, as they were already consumed.
-    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-        ret.first += 1;
-    }
-
-    if (ret.first != end) {
-        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-8 -> UTF-16LE: AVX-512 kernel first, scalar routine for whatever input is left
+  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output; // despite the name this is a count of char16_t elements (pointer difference), not bytes
+  const char* end = buf + len;
+  if (ret.first == end) { // the kernel's input cursor reached the end: nothing left for the scalar routine
     return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
-    size_t saved_bytes = ret.second - utf16_output;
-    const char* end = buf + len;
-    if (ret.first == end) {
-        return saved_bytes;
-    }
-
-    // Note: AVX512 procedure looks up 4 bytes forward, and
-    //       correctly converts multi-byte chars even if their
-    //       continuation bytes lie outsiede 16-byte window.
-    //       It meas, we have to skip continuation bytes from
-    //       the beginning ret.first, as they were already consumed.
-    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-        ret.first += 1;
-    }
-
-    if (ret.first != end) {
-        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       It means we have to skip continuation bytes from
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // (byte & 0xc0) == 0x80 matches the 10xxxxxx continuation-byte pattern
+      ret.first += 1;
+  }
+
+  if (ret.first != end) { // input remains after the skip: convert it with the scalar routine, writing at the kernel's output cursor
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
+
+  return saved_bytes; // kernel output count plus scalar output count
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-8 -> UTF-16BE: AVX-512 kernel first, scalar routine for whatever input is left
+  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output; // despite the name this is a count of char16_t elements (pointer difference), not bytes
+  const char* end = buf + len;
+  if (ret.first == end) { // the kernel's input cursor reached the end: nothing left for the scalar routine
     return saved_bytes;
-}
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept
-{
-    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-    utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-    if (ret.second == nullptr)
-        return 0;
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       It means we have to skip continuation bytes from
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // (byte & 0xc0) == 0x80 matches the 10xxxxxx continuation-byte pattern
+      ret.first += 1;
+  }
 
-    size_t saved_bytes = ret.second - utf32_output;
-    const char* end = buf + len;
-    if (ret.first == end) {
-        return saved_bytes;
-    }
-
-    // Note: the AVX512 procedure looks up 4 bytes forward, and
-    //       correctly converts multi-byte chars even if their
-    //       continuation bytes lie outside 16-byte window.
-    //       It means, we have to skip continuation bytes from
-    //       the beginning ret.first, as they were already consumed.
-    while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-        ret.first += 1;
-    }
+  if (ret.first != end) { // input remains after the skip: convert it with the scalar routine, writing at the kernel's output cursor
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
 
-    if (ret.first != end) {
-        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
-            ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-
-    return saved_bytes;
+  return saved_bytes; // kernel output count plus scalar output count
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept
-{
-    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32);
-    auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        auto new_buf = std::get<0>(ret);
-        // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
-        // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t*>(std::get<1>(ret)));
-        res.count += (std::get<0>(ret) - buf);
-        return res;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    const char* end = buf + len;
-    if (std::get<0>(ret) == end) {
-        return { simdutf::SUCCESS, saved_bytes };
-    }
 
-    // Note: the AVX512 procedure looks up 4 bytes forward, and
-    //       correctly converts multi-byte chars even if their
-    //       continuation bytes lie outside 16-byte window.
-    //       It means, we have to skip continuation bytes from
-    //       the beginning ret.first, as they were already consumed.
-    while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
-        std::get<0>(ret) += 1;
-    }
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { // UTF-8 -> UTF-32: validating AVX-512 kernel first, scalar routine for whatever input is left
+  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out); // the kernel is instantiated for uint32_t output, so the char32_t buffer is viewed as uint32_t
+  utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+  if (ret.second == nullptr) // a null output pointer from the kernel is reported to the caller as 0
+    return 0;
 
-    if (std::get<0>(ret) != end) {
-        auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t*>(utf32_output) + saved_bytes);
-        if (scalar_result.error != simdutf::SUCCESS) {
-            scalar_result.count += (std::get<0>(ret) - buf);
-        } else {
-            scalar_result.count += saved_bytes;
-        }
-        return scalar_result;
+  size_t saved_bytes = ret.second - utf32_output; // despite the name this is a count of 32-bit elements (pointer difference), not bytes
+  const char* end = buf + len;
+  if (ret.first == end) { // the kernel's input cursor reached the end: nothing left for the scalar routine
+    return saved_bytes;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside 16-byte window.
+  //       It means, we have to skip continuation bytes from
+  //       the beginning ret.first, as they were already consumed.
+  while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // (byte & 0xc0) == 0x80 matches the 10xxxxxx continuation-byte pattern
+      ret.first += 1;
+  }
+
+  if (ret.first != end) { // input remains after the skip: convert it with the scalar routine, writing after the elements already produced
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
+                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
+
+  return saved_bytes; // kernel output count plus scalar output count
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept { // UTF-8 -> UTF-32, returning a `result` (error code plus count)
+  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32); // the kernel is instantiated for uint32_t output, so the char32_t buffer is viewed as uint32_t
+  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output); // used below as a tuple: <0> input cursor, <1> output cursor, <2> boolean flag
+  if (!std::get<2>(ret)) { // flag is false: presumably the kernel rejected the input (confirm against the kernel), so locate the error with the scalar routine
+    auto new_buf = std::get<0>(ret);
+    // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
+    // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
+    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
+    res.count += (std::get<0>(ret) - buf); // rebase the scalar count onto the start of buf
+    return res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // despite the name this is a count of 32-bit elements (pointer difference), not bytes
+  const char* end = buf + len;
+  if (std::get<0>(ret) == end) { // the kernel's input cursor reached the end: nothing left for the scalar routine
+    return {simdutf::SUCCESS, saved_bytes};
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside 16-byte window.
+  //       It means, we have to skip continuation bytes from
+  //       the beginning ret.first, as they were already consumed.
+  while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { // (byte & 0xc0) == 0x80 matches the 10xxxxxx continuation-byte pattern
+      std::get<0>(ret) += 1;
+  }
+
+  if (std::get<0>(ret) != end) { // input remains after the skip: convert it with the scalar routine, writing after the elements already produced
+    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
+    if (scalar_result.error != simdutf::SUCCESS) {
+      scalar_result.count +=  (std::get<0>(ret) - buf); // error case: the input offset of the scalar start is added to the count
+    } else {
+      scalar_result.count += saved_bytes; // success case: the elements the kernel already wrote are added to the count
     }
+    return scalar_result;
+  }
 
-    return { simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output) };
+  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)}; // only continuation bytes were left: report the kernel's output count
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept
-{
-    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-    utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-    size_t saved_bytes = ret.second - utf32_output;
-    const char* end = buf + len;
-    if (ret.first == end) {
-        return saved_bytes;
-    }
-
-    // Note: AVX512 procedure looks up 4 bytes forward, and
-    //       correctly converts multi-byte chars even if their
-    //       continuation bytes lie outsiede 16-byte window.
-    //       It meas, we have to skip continuation bytes from
-    //       the beginning ret.first, as they were already consumed.
-    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-        ret.first += 1;
-    }
-
-    if (ret.first != end) {
-        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
-            ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
 
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept { // UTF-8 -> UTF-32: AVX-512 kernel first, scalar routine for whatever input is left
+  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out); // the kernel is instantiated for uint32_t output, so the char32_t buffer is viewed as uint32_t
+  utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+  size_t saved_bytes = ret.second - utf32_output; // despite the name this is a count of 32-bit elements (pointer difference), not bytes
+  const char* end = buf + len;
+  if (ret.first == end) { // the kernel's input cursor reached the end: nothing left for the scalar routine
     return saved_bytes;
-}
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
-}
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       It means we have to skip continuation bytes from
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // (byte & 0xc0) == 0x80 matches the 10xxxxxx continuation-byte pattern
+      ret.first += 1;
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
-}
+  if (ret.first != end) { // input remains after the skip: convert it with the scalar routine, writing after the elements already produced
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
+                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+  return saved_bytes; // kernel output count plus scalar output count
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
-}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16LE -> Latin-1
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16BE -> Latin-1
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    size_t outlen;
-    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-    if (inlen != len) {
-        return 0;
-    }
-    return outlen;
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16LE -> Latin-1, returning a `result`
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    size_t outlen;
-    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-    if (inlen != len) {
-        return 0;
-    }
-    return outlen;
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16BE -> Latin-1, returning a `result`
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    size_t outlen;
-    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-    if (inlen != len) {
-        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
-        res.count += inlen;
-        return res;
-    }
-    return { simdutf::SUCCESS, outlen };
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16BE -> Latin-1 via the scalar convert_valid routine
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    size_t outlen;
-    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-    if (inlen != len) {
-        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
-        res.count += inlen;
-        return res;
-    }
-    return { simdutf::SUCCESS, outlen };
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-16LE -> Latin-1 via the scalar convert_valid routine
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-16LE -> UTF-8 through the AVX-512 kernel
+  size_t outlen; // filled in by the kernel through the pointer passed below
+  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
+  if(inlen != len) { return 0; } // the kernel's return value differs from len: the call reports 0
+  return outlen;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-16BE -> UTF-8 through the AVX-512 kernel
+  size_t outlen; // filled in by the kernel through the pointer passed below
+  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
+  if(inlen != len) { return 0; } // the kernel's return value differs from len: the call reports 0
+  return outlen;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-16LE -> UTF-8, returning a `result` (error code plus count)
+  size_t outlen; // filled in by the kernel; used below as an offset into utf8_output and as the success count
+  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen); // used below as an offset into buf, i.e. input units the kernel handled
+  if(inlen != len) { // the kernel stopped before the end of the input: hand the rest to the scalar routine
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - inlen, utf8_output + outlen); // remaining input is len - inlen; outlen offsets the output buffer and must not be subtracted from the input length
+    res.count += inlen; // rebase the scalar count onto the start of buf
+    return res;
+  }
+  return {simdutf::SUCCESS, outlen};
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-16BE -> UTF-8, returning a `result` (error code plus count)
+  size_t outlen; // filled in by the kernel; used below as an offset into utf8_output and as the success count
+  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen); // used below as an offset into buf, i.e. input units the kernel handled
+  if(inlen != len) { // the kernel stopped before the end of the input: hand the rest to the scalar routine
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - inlen, utf8_output + outlen); // remaining input is len - inlen; outlen offsets the output buffer and must not be subtracted from the input length
+    res.count += inlen; // rebase the scalar count onto the start of buf
+    return res;
+  }
+  return {simdutf::SUCCESS, outlen};
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // "valid" variant: no separate code path
+  return convert_utf16le_to_utf8(buf, len, utf8_output); // delegates to the general UTF-16LE -> UTF-8 member
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { // "valid" variant: no separate code path
+  return convert_utf16be_to_utf8(buf, len, utf8_output); // delegates to the general UTF-16BE -> UTF-8 member
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-32 -> Latin-1
+  return scalar::utf32_to_latin1::convert(buf,len,latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf32_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-32 -> Latin-1, returning a `result`
+  return scalar::utf32_to_latin1::convert_with_errors(buf,len,latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept { // UTF-32 -> Latin-1 via the scalar convert_valid routine
+  return scalar::utf32_to_latin1::convert_valid(buf,len,latin1_output); // no vector kernel is called here: arguments and result pass straight through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-32 -> UTF-8: AVX-512 kernel first, scalar routine for whatever input is left
+  std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output); // first: input cursor, second: output cursor
+  if (ret.first == nullptr) { return 0; } // a null input cursor from the kernel is reported to the caller as 0
+  size_t saved_bytes = ret.second - utf8_output; // output is char, so this pointer difference is a byte count
+  if (ret.first != buf + len) { // input remains: convert it with the scalar routine, writing at the kernel's output cursor
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes; // kernel output count plus scalar output count
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept { // UTF-32 -> UTF-8, returning a `result` (error code plus count)
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); // second: output cursor
+  if (ret.first.count != len) { // the kernel's position is short of len: run the scalar routine from that position
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase the scalar count onto the start of buf
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by what the scalar routine reported
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept { // "valid" variant: no separate code path
+  return convert_utf32_to_utf8(buf, len, utf8_output); // delegates to the general UTF-32 -> UTF-8 member
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-32 -> UTF-16LE: AVX-512 kernel first, scalar routine for whatever input is left
+  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output); // first: input cursor, second: output cursor
+  if (ret.first == nullptr) { return 0; } // a null input cursor from the kernel is reported to the caller as 0
+  size_t saved_bytes = ret.second - utf16_output; // despite the name this is a count of char16_t elements (pointer difference), not bytes
+  if (ret.first != buf + len) { // input remains: convert it with the scalar routine, writing at the kernel's output cursor
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes; // kernel output count plus scalar output count
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-32 -> UTF-16BE: AVX-512 kernel first, scalar routine for whatever input is left
+  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output); // first: input cursor, second: output cursor
+  if (ret.first == nullptr) { return 0; } // a null input cursor from the kernel is reported to the caller as 0
+  size_t saved_bytes = ret.second - utf16_output; // despite the name this is a count of char16_t elements (pointer difference), not bytes
+  if (ret.first != buf + len) { // input remains: convert it with the scalar routine, writing at the kernel's output cursor
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; } // a zero from the scalar routine makes the whole call return 0
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes; // kernel output count plus scalar output count
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-32 -> UTF-16LE, returning a `result` (error code plus count)
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output); // second: output cursor
+  if (ret.first.count != len) { // the kernel's position is short of len: run the scalar routine from that position
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase the scalar count onto the start of buf
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by what the scalar routine reported
+    }
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // UTF-32 -> UTF-16BE, returning a `result` (error code plus count)
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output); // second: output cursor
+  if (ret.first.count != len) { // the kernel's position is short of len: run the scalar routine from that position
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase the scalar count onto the start of buf
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by what the scalar routine reported
+    }
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // no dedicated valid-input path here: forwards to convert_utf32_to_utf16le
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept { // no dedicated valid-input path here: forwards to convert_utf32_to_utf16be
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16LE -> UTF-32; returns the number of char32_t written, or 0 when either pass fails
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { return 0; } // flag is false: give up and report 0
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // despite the name, a pointer difference: char32_t elements written, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: hand the tail to the scalar converter
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) { return 0; } // a zero result from the scalar pass is treated as failure
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16BE -> UTF-32; returns the number of char32_t written, or 0 when either pass fails
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { return 0; } // flag is false: give up and report 0
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // despite the name, a pointer difference: char32_t elements written, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: hand the tail to the scalar converter
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) { return 0; } // a zero result from the scalar pass is treated as failure
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16LE -> UTF-32 returning a result (error code + count) instead of a bare length
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { // flag is false: rerun the scalar converter from the returned cursors and return its result
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    scalar_res.count += (std::get<0>(ret) - buf); // offset by the number of char16_t the vector pass had already consumed
+    return scalar_res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // pointer difference: char32_t elements written so far, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: finish the tail with the scalar converter
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_res.error) {
+      scalar_res.count += (std::get<0>(ret) - buf); // on error the count is offset by consumed input
+      return scalar_res;
+    } else {
+      scalar_res.count += saved_bytes; // on success the count is offset by output already written
+      return scalar_res;
+    }
+  }
+  return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16BE -> UTF-32 returning a result (error code + count) instead of a bare length
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { // flag is false: rerun the scalar converter from the returned cursors and return its result
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    scalar_res.count += (std::get<0>(ret) - buf); // offset by the number of char16_t the vector pass had already consumed
+    return scalar_res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // pointer difference: char32_t elements written so far, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: finish the tail with the scalar converter
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_res.error) {
+      scalar_res.count += (std::get<0>(ret) - buf); // on error the count is offset by consumed input
+      return scalar_res;
+    } else {
+      scalar_res.count += saved_bytes; // on success the count is offset by output already written
+      return scalar_res;
+    }
+  }
+  return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16LE -> UTF-32 "valid input" entry point; the body still checks the flag returned by the vector routine
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { return 0; } // flag is false: report 0
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // pointer difference: char32_t elements written, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: hand the tail to the scalar converter
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) { return 0; } // a zero result from the scalar pass is treated as failure
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept { // UTF-16BE -> UTF-32 "valid input" entry point; the body still checks the flag returned by the vector routine
+  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output); // (input cursor, output cursor, flag checked below)
+  if (!std::get<2>(ret)) { return 0; } // flag is false: report 0
+  size_t saved_bytes = std::get<1>(ret) - utf32_output; // pointer difference: char32_t elements written, not bytes
+  if (std::get<0>(ret) != buf + len) { // input not fully consumed: hand the tail to the scalar converter
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) { return 0; } // a zero result from the scalar pass is treated as failure
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept { // Byte-swaps `length` UTF-16 code units from input into output
+  size_t pos = 0;
+  const __m512i byteflip = _mm512_setr_epi64( // shuffle control that swaps the two bytes of every 16-bit element
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  while (pos + 32 <= length) { // full vectors: 32 code units per 512-bit register
+    __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
+    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+    _mm512_storeu_si512(output + pos, utf16);
+    pos += 32;
+  }
+  if(pos < length) { // 1..31 code units left: masked load/store touches only those elements
+    __mmask32 m((1U << (length - pos))-1); // unsigned literal: with a signed 1, a 31-unit tail would compute (1<<31)-1 in int, which overflows
+    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
+    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+    _mm512_mask_storeu_epi16(output + pos, m, utf16);
+  }
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16le(buf, len, utf16_output);
-}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16be(buf, len, utf16_output);
-}
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept { // Counts code points in UTF-16LE input: every 16-bit unit outside 0xdc00..0xdfff counts as one
+  const char16_t* end = length >= 32 ? input + length - 32 : nullptr; // last position from which a full 32-unit load fits; nullptr when length < 32
+  const char16_t* ptr = input;
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        return 0;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
+  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); // lower bound of the excluded range
+  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); // upper bound of the excluded range
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        return 0;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
+  size_t count{0};
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        scalar_res.count += (std::get<0>(ret) - buf);
-        return scalar_res;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_res.error) {
-            scalar_res.count += (std::get<0>(ret) - buf);
-            return scalar_res;
-        } else {
-            scalar_res.count += saved_bytes;
-            return scalar_res;
-        }
-    }
-    return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        scalar_res.count += (std::get<0>(ret) - buf);
-        return scalar_res;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_res.error) {
-            scalar_res.count += (std::get<0>(ret) - buf);
-            return scalar_res;
-        } else {
-            scalar_res.count += saved_bytes;
-            return scalar_res;
-        }
-    }
-    return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
+  while (ptr <= end) { // full vectors: 32 code units per iteration
+    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+    ptr += 32;
+    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); // one bit per unit that lies outside 0xdc00..0xdfff (whatever the variable name suggests)
+    count += count_ones(not_high_surrogate);
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        return 0;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+  return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input)); // the remaining (< 32) units go through the scalar counter
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (!std::get<2>(ret)) {
-        return 0;
-    }
-    size_t saved_bytes = std::get<1>(ret) - utf32_output;
-    if (std::get<0>(ret) != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept { // Counts code points in UTF-16BE input: units are byte-swapped, then every unit outside 0xdc00..0xdfff counts as one
+  const char16_t* end = length >= 32 ? input + length - 32 : nullptr; // last position from which a full 32-unit load fits; nullptr when length < 32
+  const char16_t* ptr = input;
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    size_t pos = 0;
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    while (pos + 32 <= length) {
-        __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
-        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-        _mm512_storeu_si512(output + pos, utf16);
-        pos += 32;
-    }
-    if (pos < length) {
-        __mmask32 m((1 << (length - pos)) - 1);
-        __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
-        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-        _mm512_mask_storeu_epi16(output + pos, m, utf16);
-    }
-}
+  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); // lower bound of the excluded range
+  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); // upper bound of the excluded range
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-    const char16_t* ptr = input;
+  size_t count{0};
+  const __m512i byteflip = _mm512_setr_epi64( // shuffle control that swaps the two bytes of every 16-bit element
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  while (ptr <= end) { // full vectors: 32 code units per iteration
+    __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((const __m512i*)ptr), byteflip); // const-preserving cast, matching count_utf16le
+    ptr += 32;
+    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low)); // one bit per unit that lies outside 0xdc00..0xdfff (whatever the variable name suggests)
+    count += count_ones(not_high_surrogate);
+  }
 
-    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+  return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input)); // the remaining (< 32) units go through the scalar counter
+}
 
-    size_t count { 0 };
 
-    while (ptr <= end) {
-        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-        ptr += 32;
-        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-        count += count_ones(not_high_surrogate);
-    }
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { // Counts UTF-8 code points as (bytes processed) minus (continuation bytes), plus a scalar count for the tail
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer =  length / sizeof(__m512i) * sizeof(__m512i); // Number of bytes covered by whole 512-bit chunks; continuation bytes are subtracted from this below.
+  size_t i = 0;
+  __m512i unrolled_popcount{0}; // eight 64-bit lanes accumulating continuation-byte counts from the unrolled loop
 
-    return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
-}
+  const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); // signed compare against 0xbf: bytes 0x80..0xbf (10xxxxxx) are the ones <= it
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-    const char16_t* ptr = input;
+  while (i + sizeof(__m512i) <= length) { // the two inner loops consume every whole chunk, so this body runs at most once
+    size_t iterations = (length - i) / sizeof(__m512i); // whole 64-byte chunks remaining
 
-    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); // start offset of the last whole chunk
+    for (; i + 8*sizeof(__m512i) <= max_i; i += 8*sizeof(__m512i)) { // unrolled: eight chunks (512 bytes) per iteration
+        __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
+        __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
+        __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
+        __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));
+        __m512i input5 = _mm512_loadu_si512((const __m512i *)(str + i + 4*sizeof(__m512i)));
+        __m512i input6 = _mm512_loadu_si512((const __m512i *)(str + i + 5*sizeof(__m512i)));
+        __m512i input7 = _mm512_loadu_si512((const __m512i *)(str + i + 6*sizeof(__m512i)));
+        __m512i input8 = _mm512_loadu_si512((const __m512i *)(str + i + 7*sizeof(__m512i)));
 
-    size_t count { 0 };
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    while (ptr <= end) {
-        __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
-        ptr += 32;
-        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-        count += count_ones(not_high_surrogate);
-    }
 
-    return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
-}
+        __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); // one bit per continuation byte in the chunk
+        __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
+        __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
+        __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
+        __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
+        __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
+        __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
+        __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    const char* end = length >= 64 ? input + length - 64 : nullptr;
-    const char* ptr = input;
+        __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, mask4, mask3, mask2, mask1); // pack the eight 64-bit masks into one vector so a single popcount handles all of them
 
-    const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
 
-    size_t count { 0 };
+        unrolled_popcount = _mm512_add_epi64(unrolled_popcount, _mm512_popcnt_epi64(mask_register)); // per-lane popcount, accumulated per lane
+    }
 
-    while (ptr <= end) {
-        __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        ptr += 64;
-        uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
-        count += 64 - count_ones(continuation_bitmask);
+    for (; i <= max_i; i += sizeof(__m512i)) { // leftover whole chunks, one at a time
+      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
+      uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(more_input, continuation));
+      answer -= count_ones(continuation_bitmask);
     }
+  }
+
+  __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0); // horizontal sum of the eight 64-bit lanes, done by extracting each lane
+  __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1);
+  answer -= (size_t)_mm256_extract_epi64(first_half, 0) +
+            (size_t)_mm256_extract_epi64(first_half, 1) +
+            (size_t)_mm256_extract_epi64(first_half, 2) +
+            (size_t)_mm256_extract_epi64(first_half, 3) +
+            (size_t)_mm256_extract_epi64(second_half, 0) +
+            (size_t)_mm256_extract_epi64(second_half, 1) +
+            (size_t)_mm256_extract_epi64(second_half, 2) +
+            (size_t)_mm256_extract_epi64(second_half, 3);
 
-    return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
+  return answer + scalar::utf8::count_code_points(reinterpret_cast<const char *>(str + i), length - i); // the remaining (< 64) bytes go through the scalar counter
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::latin1_length_from_utf8(buf, len);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept { // no vector path in this kernel: delegates to the scalar routine
+  return scalar::utf8::latin1_length_from_utf8(buf,len);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
-{
-    return scalar::utf16::latin1_length_from_utf16(length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { // depends only on the input length: delegates to the scalar routine
+  return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
-{
-    return scalar::utf32::latin1_length_from_utf32(length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32( size_t length) const noexcept { // depends only on the input length: delegates to the scalar routine
+  return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-    const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { // Number of UTF-8 bytes needed for the given UTF-16LE input, computed by classifying each 16-bit unit
+  const char16_t* end = length >= 32 ? input + length - 32 : nullptr; // last position from which a full 32-unit load fits; nullptr when length < 32
+  const char16_t* ptr = input;
 
-    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); // largest value counted as 1 byte
+  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); // largest value counted as 2 bytes
+  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); // upper bound of the surrogate range
+  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); // lower bound of the surrogate range
 
-    size_t count { 0 };
+  size_t count{0};
 
-    while (ptr <= end) {
-        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-        ptr += 32;
-        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+  while (ptr <= end) { // full vectors: 32 code units per iteration
+    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+    ptr += 32;
+    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); // units <= 0x7f
+    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); // units in 0x80..0x7ff
+    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); // units in 0xd800..0xdfff
 
-        size_t ascii_count = count_ones(ascii_bitmask);
-        size_t two_bytes_count = count_ones(two_bytes_bitmask);
-        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+    size_t ascii_count = count_ones(ascii_bitmask);
+    size_t two_bytes_count = count_ones(two_bytes_bitmask);
+    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; // everything not classified above
 
-        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
-    }
+    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; // each surrogate unit contributes 2, so a pair contributes 4
+  }
 
-    return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input)); // the remaining (< 32) units go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-    const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept { // Number of UTF-8 bytes needed for the given UTF-16BE input: units are byte-swapped, then classified
+  const char16_t* end = length >= 32 ? input + length - 32 : nullptr; // last position from which a full 32-unit load fits; nullptr when length < 32
+  const char16_t* ptr = input;
 
-    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); // largest value counted as 1 byte
+  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); // largest value counted as 2 bytes
+  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); // upper bound of the surrogate range
+  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); // lower bound of the surrogate range
 
-    size_t count { 0 };
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809,
-        0x0607040502030001,
-        0x0e0f0c0d0a0b0809);
-    while (ptr <= end) {
-        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-        ptr += 32;
-        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+  size_t count{0};
+  const __m512i byteflip = _mm512_setr_epi64( // shuffle control that swaps the two bytes of every 16-bit element
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+  while (ptr <= end) { // full vectors: 32 code units per iteration
+    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+    utf16 = _mm512_shuffle_epi8(utf16, byteflip); // byte-swap each unit before the comparisons below
+    ptr += 32;
+    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); // units <= 0x7f
+    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); // units in 0x80..0x7ff
+    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); // units in 0xd800..0xdfff
 
-        size_t ascii_count = count_ones(ascii_bitmask);
-        size_t two_bytes_count = count_ones(two_bytes_bitmask);
-        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
-        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
-    }
+    size_t ascii_count = count_ones(ascii_bitmask);
+    size_t two_bytes_count = count_ones(two_bytes_bitmask);
+    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count; // everything not classified above
+    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count; // each surrogate unit contributes 2, so a pair contributes 4
+  }
 
-    return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input)); // the remaining (< 32) units go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return implementation::count_utf16le(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept { // implemented as the code-point count returned by count_utf16le
+  return implementation::count_utf16le(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return implementation::count_utf16be(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept { // implemented as the code-point count returned by count_utf16be
+  return implementation::count_utf16be(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { // depends only on the input length: delegates to the scalar routine
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf32_length_from_latin1(length);
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { // depends only on the input length: delegates to the scalar routine
+  return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
-{
-    return scalar::latin1::utf8_length_from_latin1(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { // UTF-8 byte length of Latin-1 input: one per input byte plus one extra per byte with the high bit set
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = length / sizeof(__m512i) * sizeof(__m512i); // bytes covered by whole 512-bit chunks; the extra bytes are added below
+  size_t i = 0;
+  unsigned char v_0xFF = 0xff;
+  __m512i eight_64bits = _mm512_setzero_si512(); // eight 64-bit lanes accumulating the count of high-bit bytes
+  while (i + sizeof(__m512i) <= length) { // each pass handles a batch of at most 255 chunks
+    __m512i runner = _mm512_setzero_si512(); // per-byte-lane counters for this batch
+    size_t iterations = (length - i) / sizeof(__m512i);
+    if (iterations > 255) { // every chunk adds at most 1 to a byte lane of runner, so 255 chunks cannot overflow it
+      iterations = 255;
+    }
+    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); // start offset of the last chunk in this batch
+    for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) { // unrolled: four chunks per iteration
+            // Load four __m512i vectors
+            __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
+            __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
+            __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
+            __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));
+
+            // Generate four masks
+            __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); // 0 > byte as signed int8, i.e. the byte's high bit is set
+            __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
+            __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
+            __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
+            // Apply the masks and subtract from the runner
+            __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); // 0xff in flagged lanes, 0 elsewhere
+            __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
+            __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
+            __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);
+
+            runner = _mm512_sub_epi8(runner, not_ascii1); // subtracting 0xff (-1 in 8-bit arithmetic) increments the flagged lanes by one
+            runner = _mm512_sub_epi8(runner, not_ascii2);
+            runner = _mm512_sub_epi8(runner, not_ascii3);
+            runner = _mm512_sub_epi8(runner, not_ascii4);
+    }
+
+    for (; i <= max_i; i += sizeof(__m512i)) { // leftover chunks of this batch, one at a time
+      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
+
+      __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
+      __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
+      runner = _mm512_sub_epi8(runner, not_ascii);
+    }
+
+    eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); // sum-of-absolute-differences against zero adds up the byte counters within each 64-bit lane
+  }
+
+  __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); // horizontal sum of the eight 64-bit lanes, done by extracting each lane
+  __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
+  answer += (size_t)_mm256_extract_epi64(first_half, 0) +
+            (size_t)_mm256_extract_epi64(first_half, 1) +
+            (size_t)_mm256_extract_epi64(first_half, 2) +
+            (size_t)_mm256_extract_epi64(first_half, 3) +
+            (size_t)_mm256_extract_epi64(second_half, 0) +
+            (size_t)_mm256_extract_epi64(second_half, 1) +
+            (size_t)_mm256_extract_epi64(second_half, 2) +
+            (size_t)_mm256_extract_epi64(second_half, 3);
+  return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), length - i); // the remaining (< 64) bytes go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 64 <= length; pos += 64) {
-        __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input + pos));
-        uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1));
-        // We count one word for anything that is not a continuation (so
-        // leading bytes).
-        count += 64 - count_ones(utf8_continuation_mask);
-        uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
-        count += count_ones(utf8_4byte);
+    for(;pos + 64 <= length; pos += 64) {
+      __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
+      uint64_t utf8_continuation_mask = _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
+      // Continuation bytes are 0x80..0xBF (strictly below -64 as int8, so 0xC0 is excluded).
+      // We count one word for anything that is not a continuation (so leading bytes).
+      count += 64 - count_ones(utf8_continuation_mask);
+      uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
+      count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-    const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
+  const char32_t* ptr = input;
 
-    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
-    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
-    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+  const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
+  const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
+  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-    size_t count { 0 };
+  size_t count{0};
 
-    while (ptr <= end) {
-        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-        ptr += 16;
-        __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
-        __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
-        __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
+  while (ptr <= end) {
+    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+    ptr += 16;
+    __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
+    __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
+    __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
 
-        size_t ascii_count = count_ones(ascii_bitmask);
-        size_t two_bytes_count = count_ones(two_bytes_bitmask);
-        size_t three_bytes_count = count_ones(three_bytes_bitmask);
-        size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
-        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 4 * four_bytes_count;
-    }
+    size_t ascii_count = count_ones(ascii_bitmask);
+    size_t two_bytes_count = count_ones(two_bytes_bitmask);
+    size_t three_bytes_count = count_ones(three_bytes_bitmask);
+    size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
+    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
+  }
 
-    return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
+  return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-    const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
+  const char32_t* ptr = input;
 
-    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-    size_t count { 0 };
+  size_t count{0};
 
-    while (ptr <= end) {
-        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-        ptr += 16;
-        __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+  while (ptr <= end) {
+    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+    ptr += 16;
+    __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
-        count += 16 + count_ones(surrogates_bitmask);
-    }
+    count += 16 + count_ones(surrogates_bitmask);
+  }
 
-    return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
+  return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return implementation::count_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return implementation::count_utf8(input, length);
 }
 
 } // namespace icelake
@@ -22154,6 +20781,7 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* in
 SIMDUTF_UNTARGET_REGION
 #endif
 
+
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -22176,7 +20804,7 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe - uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 namespace simdutf {
@@ -22187,34 +20815,31 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
-{
-    return input.reduce_or().is_ascii();
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+  return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
 /* begin file src/haswell/avx2_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int avx2_detect_encodings(const char* buf, size_t len)
-{
+int avx2_detect_encodings(const char * buf, size_t len) {
     const char* start = buf;
     const char* end = buf + len;
 
@@ -22229,11 +20854,11 @@ int avx2_detect_encodings(const char* buf, size_t len)
 
     __m256i currentmax = _mm256_setzero_si256();
 
-    checker check {};
+    checker check{};
 
-    while (buf + 64 <= end) {
+    while(buf + 64 <= end) {
         __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(nextin);
@@ -22259,15 +20884,15 @@ int avx2_detect_encodings(const char* buf, size_t len)
             if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
                 is_utf32 = false;
                 // Code from avx2_validate_utf16le.cpp
-                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
+                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint32_t V0 = ~surrogates_bitmask0;
 
-                const auto vH0 = (in16 & v_fc) == v_dc;
+                const auto    vH0 = (in16 & v_fc) == v_dc;
                 const uint32_t H0 = vH0.to_bitmask();
 
                 const uint32_t L0 = ~H0 & surrogates_bitmask0;
@@ -22300,7 +20925,7 @@ int avx2_detect_encodings(const char* buf, size_t len)
                     } else {
                         const uint32_t V = ~surrogates_bitmask;
 
-                        const auto vH = (in_16 & v_fc) == v_dc;
+                        const auto    vH = (in_16 & v_fc) == v_dc;
                         const uint32_t H = vH.to_bitmask();
 
                         const uint32_t L = ~H & surrogates_bitmask;
@@ -22324,8 +20949,8 @@ int avx2_detect_encodings(const char* buf, size_t len)
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
+                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
 
                     // Must start checking for surrogates
                     __m256i currentoffsetmax = _mm256_setzero_si256();
@@ -22339,14 +20964,14 @@ int avx2_detect_encodings(const char* buf, size_t len)
                     currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
 
                     while (input + 8 < end32) {
-                        const __m256i in32 = _mm256_loadu_si256((__m256i*)input);
-                        currentmax = _mm256_max_epu32(in32, currentmax);
+                        const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
+                        currentmax = _mm256_max_epu32(in32,currentmax);
                         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
                         input += 8;
                     }
 
                     __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if (_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
+                    if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
                         return simdutf::encoding_type::unspecified;
                     }
                 } else {
@@ -22373,7 +20998,7 @@ int avx2_detect_encodings(const char* buf, size_t len)
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64] {};
+            uint8_t block[64]{};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -22384,14 +21009,14 @@ int avx2_detect_encodings(const char* buf, size_t len)
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -22447,9 +21072,8 @@ int avx2_detect_encodings(const char* buf, size_t len)
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template<endianness big_endian>
-const char16_t* avx2_validate_utf16(const char16_t* input, size_t size)
-{
+template <endianness big_endian>
+const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -22491,19 +21115,19 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size)
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = (in & v_fc) == v_dc;
+            const auto    vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
-                                             // (A low surrogate placed in the 7th register's word
-                                             // is an exception we handle.)
-            const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
-                                       // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b; // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by a high one.
+                                              // (A low surrogate placed in the 7th register's word
+                                              // is an exception we handle.)
+            const uint32_t b = a << 1;        // Also mark the converse (the high surrogate that follows),
+                                              // so that only two masks are needed for the valid case.
+            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -22524,9 +21148,9 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size)
     return input;
 }
 
-template<endianness big_endian>
-const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
-{
+
+template <endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -22569,19 +21193,19 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = (in & v_fc) == v_dc;
+            const auto    vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
-                                             // (A low surrogate placed in the 7th register's word
-                                             // is an exception we handle.)
-            const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
-                                       // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b; // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by a high one.
+                                              // (A low surrogate placed in the 7th register's word
+                                              // is an exception we handle.)
+            const uint32_t b = a << 1;        // Also mark the converse (the high surrogate that follows),
+                                              // so that only two masks are needed for the valid case.
+            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -22608,8 +21232,7 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size)
-{
+const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
     const char32_t* end = input + size;
 
     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
@@ -22619,26 +21242,26 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size)
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i*)input);
-        currentmax = _mm256_max_epu32(in, currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i *)input);
+        currentmax = _mm256_max_epu32(in,currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
         input += 8;
     }
     __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size)
-{
+
+const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -22649,17 +21272,17 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i*)input);
-        currentmax = _mm256_max_epu32(in, currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i *)input);
+        currentmax = _mm256_max_epu32(in,currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
 
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 8;
@@ -22673,278 +21296,311 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
 /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
+
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template<endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char16_t*& utf16_output)
-{
-    // we use an approach where we try to process up to 12 input bytes.
-    // Why 12 input bytes and not 16? Because we are concerned with the size of
-    // the lookup tables. Also 12 is nicely divisible by two and three.
-    //
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i in = _mm_loadu_si128((__m128i*)input);
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-        // We process the data in chunks of 16 bytes.
-        __m256i ascii = _mm256_cvtepu8_epi16(in);
-        if (big_endian) {
-            const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            ascii = _mm256_shuffle_epi8(ascii, swap256);
-        }
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf16_output), ascii);
-        utf16_output += 16; // We wrote 16 16-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        if (big_endian)
-            composed = _mm_shuffle_epi8(composed, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed);
-        utf16_output += 8; // We wrote 16 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-        if (big_endian)
-            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
-        utf16_output += 4;
-        return 12;
-    }
-
-    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-        // where pdep/pext is fast, we might be able to use a small lookup table.
-        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        if (big_endian)
-            composed = _mm_shuffle_epi8(composed, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed);
-        utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-        if (big_endian)
-            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
-        utf16_output += 4; // Here we overflow by 8 bytes.
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        //////////////
-        // There might be garbage inputs where a leading byte mascarades as a four-byte
-        // leading byte (by being followed by 3 continuation byte), but is not greater than
-        // 0xf0. This could trigger a buffer overflow if we only counted leading
-        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
-        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
-        // We do as at the cost of an extra mask.
-        /////////////
-        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-        // correct for spurious high bit
-        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-        // We deliberately carry the leading four bits in highbyte if they are present,
-        // we remove them later when computing hightenbits.
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-        // When we need to generate a surrogate pair (leading byte > 0xF0), then
-        // the corresponding 32-bit value in 'composed'  will be greater than
-        // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
-        // location of the surrogate pairs.
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-        // Notice the 0x3ff mask:
-        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
-        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-        uint32_t basic_buffer[4];
-        uint32_t basic_buffer_swap[4];
-        if (big_endian) {
-            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-            surrogates = _mm_shuffle_epi8(surrogates, swap);
-        }
-        _mm_storeu_si128((__m128i*)basic_buffer, composed);
-        uint32_t surrogate_buffer[4];
-        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
-        for (size_t i = 0; i < 3; i++) {
-            if (basic_buffer[i] > 0x3c00000) {
-                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-                utf16_output += 2;
-            } else {
-                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-                utf16_output++;
-            }
-        }
-    } else {
-        // here we know that there is an error but we do not handle errors
-    }
-    return consumed;
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char16_t *&utf16_output) {
+  // Converts one group of UTF-8 code points starting at input to UTF-16, writes
+  // them through utf16_output (advanced past what was produced; big_endian selects
+  // byte-swapped output) and returns the number of input bytes consumed.
+  // The table-driven path handles at most 12 input bytes: 12 rather than 16 keeps
+  // the lookup tables small, and 12 is nicely divisible by two and three.
+  // Optimization note: the table-driven path is load-latency dependent, so it may be
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  // Note that the first two fast paths test 16 mask bits and consume 16 bytes.
+  // We first try a few fast paths.
+  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // swaps the two bytes of every 16-bit lane
+  const __m128i in = _mm_loadu_si128((__m128i *)input); // always loads 16 bytes from input
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff; // low 12 bits, used to index the lookup table
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+    // Every one of the 16 bytes ends a code point: we widen all 16 bytes at once.
+    __m256i ascii = _mm256_cvtepu8_epi16(in);
+    if (big_endian) {
+      const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      ascii = _mm256_shuffle_epi8(ascii, swap256);
+    }
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 8; // We wrote 16 bytes, 8 code points.
+    return 16; // We consumed 16 bytes.
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); // 16-byte store, only the first 8 bytes are kept
+    utf16_output += 4;
+    return 12; // We consumed 12 bytes.
+  }
+
+  const uint8_t idx =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; // selects the shuffle mask and the case below
+  const uint8_t consumed =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; // returned as the number of bytes consumed
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // this is a relatively easy scenario
+    // we process SIX (6) input code-words. The max length in bytes of six code
+    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+    // where pdep/pext is fast, we might be able to use a small lookup table.
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4; // Here we overflow by 8 bytes.
+  } else if (idx < 209) {
+    // THREE (3) input code-words of up to four bytes each (the loop below emits three lanes)
+    //////////////
+    // There might be garbage inputs where a leading byte masquerades as a four-byte
+    // leading byte (by being followed by 3 continuation bytes), but is smaller than
+    // 0xf0. This could trigger a buffer overflow if we only counted leading
+    // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+    // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+    // We do so at the cost of an extra mask.
+    /////////////
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    // We deliberately carry the leading four bits in highbyte if they are present,
+    // we remove them later when computing hightenbits.
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+    // the corresponding 32-bit value in composed will be at least
+    // (0xf0000000 >> 6), that is 0x3c00000. The scalar loop below uses the test
+    // > 0x3c00000 to identify the location of the surrogate pairs.
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    const __m128i composedminus =
+        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+    const __m128i lowtenbits =
+        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+    // Notice the 0x3ff mask: it drops the leading-byte bits carried in highbyte.
+    const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+    const __m128i lowtenbitsadd =
+        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+    const __m128i hightenbitsadd =
+        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+    __m128i surrogates =
+        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); // low 16 bits: 0xD800 part, high 16 bits: 0xDC00 part
+    uint32_t basic_buffer[4];
+    uint32_t basic_buffer_swap[4]; // only filled and only read when big_endian
+    if (big_endian) {
+      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+      surrogates = _mm_shuffle_epi8(surrogates, swap);
+    }
+    _mm_storeu_si128((__m128i *)basic_buffer, composed); // unswapped copy, used for the threshold test
+    uint32_t surrogate_buffer[4];
+    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+    for (size_t i = 0; i < 3; i++) { // only the first three lanes are emitted
+      if(basic_buffer[i] > 0x3c00000) {
+        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); // first unit of the pair
+        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); // second unit of the pair
+        utf16_output += 2;
+      } else  {
+        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+        utf16_output++;
+      }
+    }
+  } else {
+    // here we know that there is an error but we do not handle errors: nothing is written
+  }
+  return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
 /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
+
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char32_t*& utf32_output)
-{
-    // we use an approach where we try to process up to 12 input bytes.
-    // Why 12 input bytes and not 16? Because we are concerned with the size of
-    // the lookup tables. Also 12 is nicely divisible by two and three.
-    //
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    const __m128i in = _mm_loadu_si128((__m128i*)input);
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-        // We process the data in chunks of 16 bytes.
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu8_epi32(in));
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
-        utf32_output += 16; // We wrote 16 32-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
-        utf32_output += 8; // We wrote 16 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 4;
-        return 12;
-    }
-    /// We do not have a fast path available, so we fallback.
-
-    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-        // where pdep/pext is fast, we might be able to use a small lookup table.
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
-        utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
-        // overflow of 32 - 24 = 8 bytes.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 4;
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-        // correct for spurious high bit
-        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
-    } else {
-        // here we know that there is an error but we do not handle errors
-    }
-    return consumed;
+size_t convert_masked_utf8_to_utf32(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char32_t *&utf32_output) {
+  // Converts one group of UTF-8 code points starting at input to UTF-32, writes
+  // them through utf32_output (advanced past what was produced) and returns the
+  // number of input bytes consumed.
+  // The table-driven path handles at most 12 input bytes: 12 rather than 16 keeps
+  // the lookup tables small, and 12 is nicely divisible by two and three.
+  // Optimization note: the table-driven path is load-latency dependent, so it may be
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  // Note that the first two fast paths test 16 mask bits and consume 16 bytes.
+  // We first try a few fast paths.
+  const __m128i in = _mm_loadu_si128((__m128i *)input); // always loads 16 bytes from input
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff; // low 12 bits, used to index the lookup table
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+    // Every one of the 16 bytes ends a code point: two stores of 8 widened bytes each.
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
+    utf32_output += 16; // We wrote 16 32-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
+    return 16; // We consumed 16 bytes.
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+    return 12; // We consumed 12 bytes.
+  }
+  /// We do not have a fast path available, so we fall back to the lookup tables.
+
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; // selects the shuffle mask and the case below
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; // returned as the number of bytes consumed
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // this is a relatively easy scenario
+    // we process SIX (6) input code-words. The max length in bytes of six code
+    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+    // where pdep/pext is fast, we might be able to use a small lookup table.
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
+    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+    // overflow of 32 - 24 = 8 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+  } else if (idx < 209) {
+    // THREE (3) input code-words of up to four bytes each (the output advances by 3 below)
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); // keeps only the low 3 bits of the top byte
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+  } else {
+    // here we know that there is an error but we do not handle errors: nothing is written
+  }
+  return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 
@@ -22999,490 +21655,486 @@ size_t convert_masked_utf8_to_utf32(const char* input,
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
+
 /*
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const char16_t* end = buf + len;
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        if (big_endian) {
-            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            in = _mm256_shuffle_epi8(in, swap);
-        }
-        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x00000000) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+template <endianness big_endian>
+std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+  const char16_t* end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+          // expected output   : [110a|aaaa|10bb|bbbb] x 16
+          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const __m256i t0 = _mm256_slli_epi16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const __m256i t2 = _mm256_and_si256(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const __m256i t3 = _mm256_or_si256(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+          // 2. merge ASCII and 2-byte codewords
+          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+          // 3. prepare bitmask for 8-bit lookup
+          const uint32_t M0 = one_byte_bitmask & 0x55555555;
+          const uint32_t M1 = M0 >> 7;
+          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+          // 4. pack the bytes
+
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+          // 5. store bytes
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+          utf8_output += row[0];
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+          utf8_output += row_2[0];
+
+          // 6. adjust pointers
+          buf += 16;
+          continue;
+    }
+    // 1. Check if there is any surrogate word in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate word
+    //    at the end of a chunk.
+    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                                0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+        const __m256i s0 = _mm256_srli_epi16(in, 4);
+        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+        const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+        const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+        const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-            // surrogate pair(s) in a register
+        // 4. expand words 16-bit => 32-bit
+        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 4 x shuffle (one per 128-bit half of out0/out1)
+        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+        // Due to the wider registers, the following path is less likely to be useful.
+        /*if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+          utf8_output += 12;
+          buf += 16;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += row0[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += row1[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+        utf8_output += row2[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+        utf8_output += row3[0];
+        buf += 16;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
         }
-    } // while
-    return std::make_pair(buf, utf8_output);
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, utf8_output);
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const char16_t* start = buf;
-    const char16_t* end = buf + len;
-
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        if (big_endian) {
-            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            in = _mm256_shuffle_epi8(in, swap);
-        }
-        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x00000000) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+template <endianness big_endian>
+std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
+  const char16_t* start = buf;
+  const char16_t* end = buf + len;
+
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+          // expected output   : [110a|aaaa|10bb|bbbb] x 8
+          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const __m256i t0 = _mm256_slli_epi16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const __m256i t2 = _mm256_and_si256(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const __m256i t3 = _mm256_or_si256(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+          // 2. merge ASCII and 2-byte codewords
+          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+          // 3. prepare bitmask for 8-bit lookup
+          const uint32_t M0 = one_byte_bitmask & 0x55555555;
+          const uint32_t M1 = M0 >> 7;
+          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+          // 4. pack the bytes
+
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+          // 5. store bytes
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+          utf8_output += row[0];
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+          utf8_output += row_2[0];
+
+          // 6. adjust pointers
+          buf += 16;
+          continue;
+    }
+    // 1. Check whether there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                                0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+        const __m256i s0 = _mm256_srli_epi16(in, 4);
+        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+        const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+        const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+        const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-            // surrogate pair(s) in a register
+        // 4. expand words 16-bit => 32-bit
+        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+        // Due to the wider registers, the following path is less likely to be useful.
+        /*if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+          utf8_output += 12;
+          buf += 16;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += row0[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += row1[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+        utf8_output += row2[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+        utf8_output += row3[0];
+        buf += 16;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
@@ -23536,786 +22188,753 @@ std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t*
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
+
 /*
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const char16_t* end = buf + len;
-    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-    while (buf + 16 <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        if (big_endian) {
-            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            in = _mm256_shuffle_epi8(in, swap);
-        }
-
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x00000000) {
-            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
-            utf32_output += 16;
-            buf += 16;
-            // surrogate pair(s) in a register
+template <endianness big_endian>
+std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
+  const char16_t* end = buf + len;
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+  while (buf + 16 <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check whether there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
+        utf32_output += 16;
+        buf += 16;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          // No surrogate pair
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    // No surrogate pair
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, utf32_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(buf, utf32_output);
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, utf32_output);
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const char16_t* start = buf;
-    const char16_t* end = buf + len;
-    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-    while (buf + 16 <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        if (big_endian) {
-            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-            in = _mm256_shuffle_epi8(in, swap);
-        }
-
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x00000000) {
-            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
-            utf32_output += 16;
-            buf += 16;
-            // surrogate pair(s) in a register
+template <endianness big_endian>
+std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
+  const char16_t* start = buf;
+  const char16_t* end = buf + len;
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+  while (buf + 16 <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check whether there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
+        utf32_output += 16;
+        buf += 16;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          // No surrogate pair
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    // No surrogate pair
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const char32_t* end = buf + len;
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-    __m256i running_max = _mm256_setzero_si256();
-    __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
-        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // Must check for overflow in packing
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-        if (saturation_bitmask == 0xffffffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
+  const char32_t* end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  __m256i running_max = _mm256_setzero_si256();
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+      // 4. pack the bytes
+
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                              0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-        } else {
-            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // may require large, non-trivial tables?
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else { // 4-byte
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    // check for invalid input
-    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    return std::make_pair(buf, utf8_output);
-}
-
-std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const char32_t* end = buf + len;
-    const char32_t* start = buf;
-
-    const __m256i v_0000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
-        // Check for too large input
-        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-            // 1. pack the bytes
-            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-            // 2. store (16 bytes)
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-            // 3. adjust pointers
-            buf += 16;
-            utf8_output += 16;
-            continue; // we are done for this round!
-        }
-        // no bits set above 7th bit
-        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-        if (one_or_two_bytes_bitmask == 0xffffffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m256i t3 = _mm256_or_si256(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            const uint32_t M0 = one_byte_bitmask & 0x55555555;
-            const uint32_t M1 = M0 >> 7;
-            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-            // 4. pack the bytes
-
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
-
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-            utf8_output += row[0];
-            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
-            utf8_output += row_2[0];
-
-            // 6. adjust pointers
-            buf += 16;
-            continue;
-        }
-        // Must check for overflow in packing
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-        if (saturation_bitmask == 0xffffffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-            // Check for illegal surrogate words
-            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-            }
-
-            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+      // 4. expand words 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be useful.
+      /*if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+        utf8_output += 12;
+        buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // may require large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) { // 2-byte
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {  // 4-byte
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  // check for invalid input
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+    return std::make_pair(nullptr, utf8_output);
+  }
+
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
+
+  return std::make_pair(buf, utf8_output);
+}
+
+
+std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
+  const char32_t* end = buf + len;
+  const char32_t* start = buf;
+
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+    // Check for too large input
+    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+    }
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
+      // 4. pack the bytes
+
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+      // Check for illegal surrogate words
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+      }
+
+      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                                              0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m256i s4 = _mm256_xor_si256(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-            // Due to the wider registers, the following path is less likely to be useful.
-            /*if(mask == 0) {
-              // We only have three-byte words. Use fast path.
-              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-              utf8_output += 12;
-              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-              utf8_output += 12;
-              buf += 16;
-              continue;
-            }*/
-            const uint8_t mask0 = uint8_t(mask);
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-            utf8_output += row2[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-            utf8_output += row3[0];
-            buf += 16;
-        } else {
-            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // may require large, non-trivial tables?
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else { // 4-byte
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+      // 4. expand words 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be useful.
+      /*if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+        utf8_output += 12;
+        buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // may require large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) { // 2-byte
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {  // 4-byte
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
-template<endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const char32_t* end = buf + len;
+template <endianness big_endian>
+std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const char32_t* end = buf + len;
 
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-    __m256i forbidden_bytemask = _mm256_setzero_si256();
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
 
-    while (buf + 8 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-        const __m256i v_00000000 = _mm256_setzero_si256();
-        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  while (buf + 8 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-        if (saturation_bitmask == 0xffffffff) {
-            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+      if (big_endian) {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else {
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k;
     }
+  }
 
-    // check for invalid input
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf16_output);
-    }
+  // check for invalid input
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
 
-    return std::make_pair(buf, utf16_output);
+  return std::make_pair(buf, utf16_output);
 }
 
-template<endianness big_endian>
-std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const char32_t* start = buf;
-    const char32_t* end = buf + len;
 
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+template <endianness big_endian>
+std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
+  const char32_t* start = buf;
+  const char32_t* end = buf + len;
 
-    while (buf + 8 + safety_margin <= end) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-
-        const __m256i v_00000000 = _mm256_setzero_si256();
-        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-        if (saturation_bitmask == 0xffffffff) {
-            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-            }
+  while (buf + 8 + safety_margin <= end) {
+    __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
+      }
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+      if (big_endian) {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else {
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k;
     }
+  }
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
@@ -24332,103 +22951,85 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
-    simdutf_really_inline size_t block_index();
-    simdutf_really_inline bool has_full_block() const;
-    simdutf_really_inline const uint8_t* full_block() const;
-    /**
-     * Get the last block, padded with spaces.
-     *
-     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-     * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-     *
-     * @return the number of effective characters in the last block.
-     */
-    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
-    simdutf_really_inline void advance();
-
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance();
 private:
-    const uint8_t* buf;
-    const size_t len;
-    const size_t lenminusstep;
-    size_t idx;
+  const uint8_t *buf;
+  const size_t len;
+  const size_t lenminusstep;
+  size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text_64(const uint8_t* text)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text_64(const uint8_t *text) {
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    in.store(reinterpret_cast<uint8_t*>(buf));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        if (buf[i] < ' ') {
-            buf[i] = '_';
-        }
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t*>(buf));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    if (buf[i] < ' ') { buf[i] = '_'; }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_unused static char* format_mask(uint64_t mask)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
-    for (size_t i = 0; i < 64; i++) {
-        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-    }
-    buf[64] = '\0';
-    return buf;
+simdutf_unused static char * format_mask(uint64_t mask) {
+  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
+  for (size_t i=0; i<64; i++) {
+    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+  }
+  buf[64] = '\0';
+  return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
-    : buf { _buf }
-    , len { _len }
-    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
-    , idx { 0 }
-{
-}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
-{
-    return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
-{
-    return &buf[idx];
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
-{
-    if (len == idx) {
-        return 0;
-    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-    std::memcpy(dst, buf + idx, len - idx);
-    return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx);
+  return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
-{
-    idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE;
 }
 
 } // unnamed namespace
@@ -24444,22 +23045,21 @@ namespace utf8_validation {
 
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -24467,92 +23067,101 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the block:
-// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
-{
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-}
+  }
 
-struct utf8_checker {
+  struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -24563,54 +23172,51 @@ struct utf8_checker {
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof()
-    {
-        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-        // possibly finish them.
-        this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof() {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
-    {
-        if (simdutf_likely(is_ascii(input))) {
-            this->error |= this->prev_incomplete;
-        } else {
-            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                "We support either two or four chunks per 64-byte block.");
-            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-            }
-            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
+      if(simdutf_likely(is_ascii(input))) {
+        this->error |= this->prev_incomplete;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
+        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+
+      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -24630,16 +23236,15 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t* input, size_t length)
-{
-    checker c {};
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -24648,106 +23253,97 @@ bool generic_validate_utf8(const uint8_t* input, size_t length)
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char* input, size_t length)
-{
-    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_utf8(const char * input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
-{
-    checker c {};
+result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
+    size_t count{0};
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        if (c.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            res.count += count;
-            return res;
-        }
-        reader.advance();
-        count += 64;
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      if(c.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        res.count += count;
+        return res;
+      }
+      reader.advance();
+      count += 64;
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-        res.count += count;
-        return res;
+      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+      res.count += count;
+      return res;
     } else {
-        return result(error_code::SUCCESS, length);
+      return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char* input, size_t length)
-{
-    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_utf8_with_errors(const char * input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t* input, size_t length)
-{
+bool generic_validate_ascii(const uint8_t * input, size_t length) {
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64] {};
+    uint8_t blocks[64]{};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        running_or |= in;
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      running_or |= in;
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char* input, size_t length)
-{
-    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_ascii(const char * input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
-{
-    buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
-    while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        if (!in.is_ascii()) {
-            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            return result(res.error, count + res.count);
-        }
-        reader.advance();
-
-        count += 64;
-    }
-    uint8_t block[64] {};
-    reader.get_remainder(block);
-    simd::simd8x64<uint8_t> in(block);
+result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
     if (!in.is_ascii()) {
-        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        return result(res.error, count + res.count);
-    } else {
-        return result(error_code::SUCCESS, length);
+      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+      return result(res.error, count + res.count);
     }
+    reader.advance();
+
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
 }
 
-result generic_validate_ascii_with_errors(const char* input, size_t length)
-{
-    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_ascii_with_errors(const char * input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 } // namespace utf8_validation
@@ -24759,6 +23355,7 @@ result generic_validate_ascii_with_errors(const char* input, size_t length)
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace haswell {
 namespace {
@@ -24766,64 +23363,63 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template<endianness endian>
+template <endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept
-{
-    // The implementation is not specific to haswell and should be moved to the generic directory.
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        // this loop could be unrolled further. For example, we could process the mask
-        // far more than 64 bytes.
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf16<endian>(utf16_output);
-            utf16_output += 64;
-            pos += 64;
-        } else {
-            // Slow path. We hope that the compiler will recognize that this is a slow path.
-            // Anything that is not a continuation mask is a 'leading byte', that is, the
-            // start of a new code point.
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            // The *start* of code points is not so useful, rather, we want the *end* of code points.
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            // We process in blocks of up to 12 bytes except possibly
-            // for fast paths which may process up to 16 bytes. For the
-            // slow path to work, we should have at least 12 input bytes left.
-            size_t max_starting_point = (pos + 64) - 12;
-            // Next loop is going to run at least five times when using solely
-            // the slow/regular path, and at least four times if there are fast paths.
-            while (pos < max_starting_point) {
-                // Performance note: our ability to compute 'consumed' and
-                // then shift and recompute is critical. If there is a
-                // latency of, say, 4 cycles on getting 'consumed', then
-                // the inner loop might have a total latency of about 6 cycles.
-                // Yet we process between 6 to 12 inputs bytes, thus we get
-                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                // for this section of the code. Hence, there is a limit
-                // to how much we can further increase this latency before
-                // it seriously harms performance.
-                //
-                // Thus we may allow convert_masked_utf8_to_utf16 to process
-                // more bytes at a time under a fast-path mode where 16 bytes
-                // are consumed at once (e.g., when encountering ASCII).
-                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                    utf8_end_of_code_point_mask, utf16_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-            // At this point there may remain between 0 and 12 bytes in the
-            // 64-byte block. These bytes will be processed again. So we have an
-            // 80% efficiency (in the worst case). In practice we expect an
-            // 85% to 90% efficiency.
-        }
-    }
-    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-    return utf16_output - start;
+    char16_t* utf16_output) noexcept {
+  // The implementation is not specific to haswell and should be moved to the generic directory.
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the mask
+    // far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow path.
+      // Anything that is not a continuation byte is a 'leading byte', that is, the
+      // start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end* of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while(pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -24834,28 +23430,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -24863,274 +23460,275 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    template<endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf16_output += howmany;
-        }
-        return utf16_output - start;
-    }
-
-    template<endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // prev1 supplies "byte 1" and input "byte 2" of each adjacent pair tested by check_special_cases;
+      // check_multibyte_lengths then folds in prev2/prev3, and any surviving bit is OR-ed into this->error.
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+    template <endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop runs at least five times on the slow path alone, at least four with fast paths.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+        if(howmany == 0) { return 0; }
+        utf16_output += howmany;
+      }
+      return utf16_output - start;
+    }
+
+    template <endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop runs at least five times on the slow path alone, at least four with fast paths.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf16_output += res.count;
         }
-        if (pos < size) {
-            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf16_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf16_output - start);
+      }
+      return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct validating_transcoder
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace haswell
@@ -25147,36 +23745,37 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
+
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept
-{
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf32(utf32_output);
-            utf32_output += 64;
-            pos += 64;
-        } else {
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            size_t max_starting_point = (pos + 64) - 12;
-            while (pos < max_starting_point) {
-                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                    utf8_end_of_code_point_mask, utf32_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-        }
+    char32_t* utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+    // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+    size_t max_starting_point = (pos + 64) - 12;
+    while(pos < max_starting_point) {
+      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                          utf8_end_of_code_point_mask, utf32_output);
+      pos += consumed;
+      utf8_end_of_code_point_mask >>= consumed;
+      }
     }
-    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-    return utf32_output - start;
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+  return utf32_output - start;
 }
 
+
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace haswell
@@ -25185,28 +23784,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
+
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -25214,266 +23814,268 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-struct validating_transcoder {
+
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf32_output += howmany;
-        }
-        return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+        if(howmany == 0) { return 0; }
+        utf32_output += howmany;
+      }
+      return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf32_output += res.count;
         }
-        if (pos < size) {
-            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf32_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf32_output - start);
+      }
+      return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace haswell
@@ -25490,37 +24092,36 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size)
-{
+simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        count += 64 - count_ones(utf8_continuation_mask);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        // We count one word for anything that is not a continuation (so
-        // leading bytes).
-        count += 64 - count_ones(utf8_continuation_mask);
-        int64_t utf8_4byte = input.gteq_unsigned(240);
-        count += count_ones(utf8_4byte);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      // We count one word for anything that is not a continuation (so
+      // leading bytes).
+      count += 64 - count_ones(utf8_continuation_mask);
+      int64_t utf8_4byte = input.gteq_unsigned(240);
+      count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -25535,65 +24136,57 @@ namespace haswell {
 namespace {
 namespace utf16 {
 
-template<endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-        count += count_ones(not_pair) / 2;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+      count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t ascii_mask = input.lteq(0x7F);
-        uint64_t twobyte_mask = input.lteq(0x7FF);
-        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-        size_t ascii_count = count_ones(ascii_mask) / 2;
-        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t ascii_mask = input.lteq(0x7F);
+      uint64_t twobyte_mask = input.lteq(0x7FF);
+      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+      size_t ascii_count = count_ones(ascii_mask) / 2;
+      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
+      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
+      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
-{
-    size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
+  size_t pos = 0;
 
-    while (pos + 32 <= size) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        input.swap_bytes();
-        input.store(reinterpret_cast<uint16_t*>(output));
-        pos += 32;
-        output += 32;
-    }
+  while (pos + 32 <= size) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
 
-    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -25605,661 +24198,590 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace haswell {
 
-simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    if (length % 2 == 0) {
-        return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
+simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  if (length % 2 == 0) {
+    return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
+  } else {
+    if (implementation::validate_utf8(input, length)) {
+      return simdutf::encoding_type::UTF8;
     } else {
-        if (implementation::validate_utf8(input, length)) {
-            return simdutf::encoding_type::UTF8;
-        } else {
-            return simdutf::encoding_type::unspecified;
-        }
+      return simdutf::encoding_type::unspecified;
     }
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    return haswell::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_utf8(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return haswell::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_ascii(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
-{
-    const char32_t* tail = avx2_validate_utf32le(buf, len);
-    if (tail) {
-        return scalar::utf32::validate(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t* tail = avx2_validate_utf32le(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
-    result res = avx2_validate_utf32le_with_errors(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  result res = avx2_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
-{
-    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept
-{
-    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+    char32_t* utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
-}
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    return saved_bytes;
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_with_errors(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_valid(buf,len,latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf32_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16be_to_utf32(buf, len, utf32_output);
-}
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
+  return scalar::utf8::latin1_length_from_utf8(buf,len);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32( size_t length) const noexcept {
+  return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::latin1_length_from_utf8(buf, len);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
-{
-    return scalar::utf16::latin1_length_from_utf16(length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
-{
-    return scalar::utf32::latin1_length_from_utf32(length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
-}
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
-}
 
-simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *input, size_t len) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
+  size_t i = 0;
+  __m256i four_64bits = _mm256_setzero_si256();
+  while (i + sizeof(__m256i) <= len) {
+    __m256i runner = _mm256_setzero_si256();
+    // We can do up to 255 loops without overflow.
+    size_t iterations = (len - i) / sizeof(__m256i);
+    if (iterations > 255) {
+      iterations = 255;
+    }
+    size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
+    for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) {
+      __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+      __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+      __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i)));
+      __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i)));
+      __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
+              _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+      __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
+              _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+      __m256i input1234 = _mm256_add_epi8(input12, input23);
+      runner = _mm256_sub_epi8(
+          runner, input1234);
+    }
+    for (; i <= max_i; i += sizeof(__m256i)) {
+      __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i));
+      runner = _mm256_sub_epi8(
+          runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+    }
+    four_64bits = _mm256_add_epi64(
+        four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
+  }
+  answer += _mm256_extract_epi64(four_64bits, 0) +
+            _mm256_extract_epi64(four_64bits, 1) +
+            _mm256_extract_epi64(four_64bits, 2) +
+            _mm256_extract_epi64(four_64bits, 3);
+  return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(data + i), len - i);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf32_length_from_latin1(length);
-}
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const __m256i v_00000000 = _mm256_setzero_si256();
+  const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
+  const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 8 <= length; pos += 8) {
+    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+    const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
+    const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
+    const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+    const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+    const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+    const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
-{
-    return scalar::latin1::utf8_length_from_latin1(input, length);
+    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
+    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+    count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
+  }
+  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
-    const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 8 <= length; pos += 8) {
-        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-        const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
-        const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
-        const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
-        const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-        const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
-        const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
-        const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
-
-        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-        count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
-    }
-    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
-}
-
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 8 <= length; pos += 8) {
-        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-        const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-        const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
-        size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
-        count += 8 + surrogate_count;
-    }
-    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
-}
-
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const __m256i v_00000000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 8 <= length; pos += 8) {
+    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+    const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+    size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4;
+    count += 8 + surrogate_count;
+  }
+  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace haswell
@@ -26273,6 +24795,7 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* in
 SIMDUTF_UNTARGET_REGION
 #endif
 
+
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -26283,6 +24806,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=ppc64/implementation.cpp
 /* begin file src/ppc64/implementation.cpp */
 
+
+
+
+
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/begin.h
 /* begin file src/simdutf/ppc64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
@@ -26296,27 +24823,25 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
-{
-    // careful: 0x80 is not ascii.
-    return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+  // careful: 0x80 is not ascii.
+  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
 } // unnamed namespace
@@ -26333,103 +24858,85 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
-    simdutf_really_inline size_t block_index();
-    simdutf_really_inline bool has_full_block() const;
-    simdutf_really_inline const uint8_t* full_block() const;
-    /**
-     * Get the last block, padded with spaces.
-     *
-     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-     * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-     *
-     * @return the number of effective characters in the last block.
-     */
-    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
-    simdutf_really_inline void advance();
-
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance();
 private:
-    const uint8_t* buf;
-    const size_t len;
-    const size_t lenminusstep;
-    size_t idx;
+  const uint8_t *buf;
+  const size_t len;
+  const size_t lenminusstep;
+  size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text_64(const uint8_t* text)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text_64(const uint8_t *text) {
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    in.store(reinterpret_cast<uint8_t*>(buf));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        if (buf[i] < ' ') {
-            buf[i] = '_';
-        }
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t*>(buf));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    if (buf[i] < ' ') { buf[i] = '_'; }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_unused static char* format_mask(uint64_t mask)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
-    for (size_t i = 0; i < 64; i++) {
-        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-    }
-    buf[64] = '\0';
-    return buf;
+simdutf_unused static char * format_mask(uint64_t mask) {
+  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
+  for (size_t i=0; i<64; i++) {
+    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+  }
+  buf[64] = '\0';
+  return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
-    : buf { _buf }
-    , len { _len }
-    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
-    , idx { 0 }
-{
-}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
-{
-    return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
-{
-    return &buf[idx];
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
-{
-    if (len == idx) {
-        return 0;
-    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-    std::memcpy(dst, buf + idx, len - idx);
-    return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx);
+  return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
-{
-    idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE;
 }
 
 } // unnamed namespace
@@ -26445,22 +24952,21 @@ namespace utf8_validation {
 
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -26468,92 +24974,101 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the block:
-// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
-{
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-}
+  }
 
-struct utf8_checker {
+  struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -26564,54 +25079,51 @@ struct utf8_checker {
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof()
-    {
-        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-        // possibly finish them.
-        this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof() {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
-    {
-        if (simdutf_likely(is_ascii(input))) {
-            this->error |= this->prev_incomplete;
-        } else {
-            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                "We support either two or four chunks per 64-byte block.");
-            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-            }
-            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
+      if(simdutf_likely(is_ascii(input))) {
+        this->error |= this->prev_incomplete;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
+        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+
+      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -26631,16 +25143,15 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t* input, size_t length)
-{
-    checker c {};
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -26649,106 +25160,97 @@ bool generic_validate_utf8(const uint8_t* input, size_t length)
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char* input, size_t length)
-{
-    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_utf8(const char * input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
-{
-    checker c {};
+result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
+    size_t count{0};
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        if (c.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            res.count += count;
-            return res;
-        }
-        reader.advance();
-        count += 64;
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      if(c.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        res.count += count;
+        return res;
+      }
+      reader.advance();
+      count += 64;
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-        res.count += count;
-        return res;
+      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+      res.count += count;
+      return res;
     } else {
-        return result(error_code::SUCCESS, length);
+      return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char* input, size_t length)
-{
-    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_utf8_with_errors(const char * input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t* input, size_t length)
-{
+bool generic_validate_ascii(const uint8_t * input, size_t length) {
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64] {};
+    uint8_t blocks[64]{};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        running_or |= in;
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      running_or |= in;
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char* input, size_t length)
-{
-    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_ascii(const char * input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
-{
-    buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
-    while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        if (!in.is_ascii()) {
-            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            return result(res.error, count + res.count);
-        }
-        reader.advance();
-
-        count += 64;
-    }
-    uint8_t block[64] {};
-    reader.get_remainder(block);
-    simd::simd8x64<uint8_t> in(block);
+result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
     if (!in.is_ascii()) {
-        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        return result(res.error, count + res.count);
-    } else {
-        return result(error_code::SUCCESS, length);
+      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+      return result(res.error, count + res.count);
     }
+    reader.advance();
+
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
 }
 
-result generic_validate_ascii_with_errors(const char* input, size_t length)
-{
-    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_ascii_with_errors(const char * input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 } // namespace utf8_validation
@@ -26760,6 +25262,7 @@ result generic_validate_ascii_with_errors(const char* input, size_t length)
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace ppc64 {
 namespace {
@@ -26767,64 +25270,63 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template<endianness endian>
+template <endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept
-{
-    // The implementation is not specific to haswell and should be moved to the generic directory.
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        // this loop could be unrolled further. For example, we could process the mask
-        // far more than 64 bytes.
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf16<endian>(utf16_output);
-            utf16_output += 64;
-            pos += 64;
-        } else {
-            // Slow path. We hope that the compiler will recognize that this is a slow path.
-            // Anything that is not a continuation mask is a 'leading byte', that is, the
-            // start of a new code point.
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            // The *start* of code points is not so useful, rather, we want the *end* of code points.
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            // We process in blocks of up to 12 bytes except possibly
-            // for fast paths which may process up to 16 bytes. For the
-            // slow path to work, we should have at least 12 input bytes left.
-            size_t max_starting_point = (pos + 64) - 12;
-            // Next loop is going to run at least five times when using solely
-            // the slow/regular path, and at least four times if there are fast paths.
-            while (pos < max_starting_point) {
-                // Performance note: our ability to compute 'consumed' and
-                // then shift and recompute is critical. If there is a
-                // latency of, say, 4 cycles on getting 'consumed', then
-                // the inner loop might have a total latency of about 6 cycles.
-                // Yet we process between 6 to 12 inputs bytes, thus we get
-                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                // for this section of the code. Hence, there is a limit
-                // to how much we can further increase this latency before
-                // it seriously harms performance.
-                //
-                // Thus we may allow convert_masked_utf8_to_utf16 to process
-                // more bytes at a time under a fast-path mode where 16 bytes
-                // are consumed at once (e.g., when encountering ASCII).
-                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                    utf8_end_of_code_point_mask, utf16_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-            // At this point there may remain between 0 and 12 bytes in the
-            // 64-byte block. These bytes will be processed again. So we have an
-            // 80% efficiency (in the worst case). In practice we expect an
-            // 85% to 90% efficiency.
-        }
-    }
-    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-    return utf16_output - start;
+    char16_t* utf16_output) noexcept {
+  // This implementation is not haswell-specific; it already lives in the generic directory (generic/utf8_to_utf16).
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the mask
+    // far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow path.
+      // Anything that is not a continuation byte is a 'leading byte', that is, the
+      // start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end* of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while(pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -26835,28 +25337,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -26864,274 +25367,275 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    template<endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf16_output += howmany;
-        }
-        return utf16_output - start;
-    }
-
-    template<endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+    template <endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+        if(howmany == 0) { return 0; }
+        utf16_output += howmany;
+      }
+      return utf16_output - start;
+    }
+
+    template <endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf16_output += res.count;
         }
-        if (pos < size) {
-            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf16_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf16_output - start);
+      }
+      return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace ppc64
@@ -27148,36 +25652,37 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
+
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept
-{
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf32(utf32_output);
-            utf32_output += 64;
-            pos += 64;
-        } else {
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            size_t max_starting_point = (pos + 64) - 12;
-            while (pos < max_starting_point) {
-                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                    utf8_end_of_code_point_mask, utf32_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-        }
+    char32_t* utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+    // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+    size_t max_starting_point = (pos + 64) - 12;
+    while(pos < max_starting_point) {
+      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                          utf8_end_of_code_point_mask, utf32_output);
+      pos += consumed;
+      utf8_end_of_code_point_mask >>= consumed;
+      }
     }
-    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-    return utf32_output - start;
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+  return utf32_output - start;
 }
 
+
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace ppc64
@@ -27186,28 +25691,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
+
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -27215,266 +25721,268 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf32_output += howmany;
-        }
-        return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then margin is now the index of the fourth-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+        if(howmany == 0) { return 0; }
+        utf32_output += howmany;
+      }
+      return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then margin is now the index of the fourth-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf32_output += res.count;
         }
-        if (pos < size) {
-            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf32_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf32_output - start);
+      }
+      return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace ppc64
@@ -27491,37 +25999,36 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size)
-{
+simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        count += 64 - count_ones(utf8_continuation_mask);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        // We count one word for anything that is not a continuation (so
-        // leading bytes).
-        count += 64 - count_ones(utf8_continuation_mask);
-        int64_t utf8_4byte = input.gteq_unsigned(240);
-        count += count_ones(utf8_4byte);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      // We count one word for anything that is not a continuation (so
+      // leading bytes).
+      count += 64 - count_ones(utf8_continuation_mask);
+      int64_t utf8_4byte = input.gteq_unsigned(240);
+      count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -27536,65 +26043,57 @@ namespace ppc64 {
 namespace {
 namespace utf16 {
 
-template<endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-        count += count_ones(not_pair) / 2;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+      count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t ascii_mask = input.lteq(0x7F);
-        uint64_t twobyte_mask = input.lteq(0x7FF);
-        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-        size_t ascii_count = count_ones(ascii_mask) / 2;
-        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t ascii_mask = input.lteq(0x7F);
+      uint64_t twobyte_mask = input.lteq(0x7FF);
+      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+      size_t ascii_count = count_ones(ascii_mask) / 2;
+      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
+      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
+      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
-{
-    size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
+  size_t pos = 0;
 
-    while (pos + 32 <= size) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        input.swap_bytes();
-        input.store(reinterpret_cast<uint16_t*>(output));
-        pos += 32;
-        output += 32;
-    }
+  while (pos + 32 <= size) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
 
-    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -27609,289 +26108,228 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace ppc64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    int out = 0;
-    if (validate_utf8(input, length)) {
-        out |= encoding_type::UTF8;
-    }
-    if ((length % 2) == 0) {
-        if (validate_utf16(reinterpret_cast<const char16_t*>(input), length / 2)) {
-            out |= encoding_type::UTF16_LE;
-        }
-    }
-    if ((length % 4) == 0) {
-        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
-            out |= encoding_type::UTF32_LE;
-        }
-    }
+simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
+  }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
+  }
 
-    return out;
+  return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    return ppc64::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_utf8(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return ppc64::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_ascii(buf,len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate<endianness::LITTLE>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate<endianness::BIG>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
-    return scalar::utf32::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char16_t* buf, size_t len) const noexcept
-{
-    return scalar::utf32::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
-{
-    return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
-{
-    return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
+  return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    scalar::utf16::change_endianness_utf16(input, length, output);
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
+  scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  return scalar::utf32::utf8_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  return scalar::utf32::utf16_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace ppc64
@@ -27924,34 +26362,30 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
-{
-    return input.reduce_or().is_ascii();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+  return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
-{
-    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_detect_encodings.cpp
 /* begin file src/westmere/sse_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int sse_detect_encodings(const char* buf, size_t len)
-{
+int sse_detect_encodings(const char * buf, size_t len) {
     const char* start = buf;
     const char* end = buf + len;
 
@@ -27966,13 +26400,13 @@ int sse_detect_encodings(const char* buf, size_t len)
 
     __m128i currentmax = _mm_setzero_si128();
 
-    checker check {};
+    checker check{};
 
-    while (buf + 64 <= end) {
+    while(buf + 64 <= end) {
         __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i secondin = _mm_loadu_si128((__m128i*)buf + 1);
-        __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
-        __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
+        __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
+        __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
+        __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -28006,15 +26440,15 @@ int sse_detect_encodings(const char* buf, size_t len)
                 is_utf32 = false;
                 // Code from sse_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_bitmask1
-                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
+                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
 
-                const auto vH0 = (in16 & v_fc) == v_dc;
+                const auto    vH0 = (in16 & v_fc) == v_dc;
                 const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
 
                 const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
@@ -28050,7 +26484,7 @@ int sse_detect_encodings(const char* buf, size_t len)
                     } else {
                         const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
-                        const auto vH = (in_16 & v_fc) == v_dc;
+                        const auto    vH = (in_16 & v_fc) == v_dc;
                         const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
                         const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
@@ -28075,8 +26509,8 @@ int sse_detect_encodings(const char* buf, size_t len)
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
+                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
 
                     // Must start checking for surrogates
                     __m128i currentoffsetmax = _mm_setzero_si128();
@@ -28094,14 +26528,14 @@ int sse_detect_encodings(const char* buf, size_t len)
                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
 
                     while (input + 4 < end32) {
-                        const __m128i in32 = _mm_loadu_si128((__m128i*)input);
-                        currentmax = _mm_max_epu32(in32, currentmax);
+                        const __m128i in32 = _mm_loadu_si128((__m128i *)input);
+                        currentmax = _mm_max_epu32(in32,currentmax);
                         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if (_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
+                    if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -28130,7 +26564,7 @@ int sse_detect_encodings(const char* buf, size_t len)
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64] {};
+            uint8_t block[64]{};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -28141,14 +26575,14 @@ int sse_detect_encodings(const char* buf, size_t len)
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m128i standardmax = _mm_set1_epi32(0x10ffff);
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
+        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -28204,9 +26638,8 @@ int sse_detect_encodings(const char* buf, size_t len)
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template<endianness big_endian>
-const char16_t* sse_validate_utf16(const char16_t* input, size_t size)
-{
+template <endianness big_endian>
+const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -28247,19 +26680,19 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size)
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = (in & v_fc) == v_dc;
+            const auto    vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
-                                                                    // (A low surrogate placed in the 7th register's word
-                                                                    // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opinput - startite fact is hold,
-                                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
+                                              // (A low surrogate placed in the 7th register's word
+                                              // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opposite fact holds;
+                                              // thanks to that we have only two masks for the valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -28280,9 +26713,9 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size)
     return input;
 }
 
-template<endianness big_endian>
-const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
-{
+
+template <endianness big_endian>
+const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -28325,19 +26758,19 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = (in & v_fc) == v_dc;
+            const auto    vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
-                                                                    // (A low surrogate placed in the 7th register's word
-                                                                    // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opinput - startite fact is hold,
-                                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
+                                              // (A low surrogate placed in the 7th register's word
+                                              // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opposite fact holds;
+                                              // thanks to that we have only two masks for the valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -28364,8 +26797,7 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* sse_validate_utf32le(const char32_t* input, size_t size)
-{
+const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
     const char32_t* end = input + size;
 
     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
@@ -28375,26 +26807,26 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size)
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i*)input);
-        currentmax = _mm_max_epu32(in, currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i *)input);
+        currentmax = _mm_max_epu32(in,currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
         input += 4;
     }
     __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size)
-{
+
+const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -28405,17 +26837,17 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i*)input);
-        currentmax = _mm_max_epu32(in, currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i *)input);
+        currentmax = _mm_max_epu32(in,currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
 
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 4;
@@ -28429,283 +26861,316 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
 /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
+
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template<endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char16_t*& utf16_output)
-{
-    // we use an approach where we try to process up to 12 input bytes.
-    // Why 12 input bytes and not 16? Because we are concerned with the size of
-    // the lookup tables. Also 12 is nicely divisible by two and three.
-    //
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i in = _mm_loadu_si128((__m128i*)input);
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-        // We process the data in chunks of 16 bytes.
-        __m128i ascii_first = _mm_cvtepu8_epi16(in);
-        __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
-        if (big_endian) {
-            ascii_first = _mm_shuffle_epi8(ascii_first, swap);
-            ascii_second = _mm_shuffle_epi8(ascii_second, swap);
-        }
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output), ascii_first);
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + 8), ascii_second);
-        utf16_output += 16; // We wrote 16 16-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        if (big_endian)
-            composed = _mm_shuffle_epi8(composed, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed);
-        utf16_output += 8; // We wrote 16 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-        if (big_endian)
-            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
-        utf16_output += 4;
-        return 12;
-    }
-    /// We do not have a fast path available, so we fallback.
-
-    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-        // where pdep/pext is fast, we might be able to use a small lookup table.
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        if (big_endian)
-            composed = _mm_shuffle_epi8(composed, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed);
-        utf16_output += 6; // We wrote 12 bytes, 6 code points.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-        if (big_endian)
-            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
-        utf16_output += 4;
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        //////////////
-        // There might be garbage inputs where a leading byte mascarades as a four-byte
-        // leading byte (by being followed by 3 continuation byte), but is not greater than
-        // 0xf0. This could trigger a buffer overflow if we only counted leading
-        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
-        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
-        // We do as at the cost of an extra mask.
-        /////////////
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-        // correct for spurious high bit
-        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-        // We deliberately carry the leading four bits in highbyte if they are present,
-        // we remove them later when computing hightenbits.
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-        // When we need to generate a surrogate pair (leading byte > 0xF0), then
-        // the corresponding 32-bit value in 'composed'  will be greater than
-        // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
-        // location of the surrogate pairs.
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-        // Notice the 0x3ff mask:
-        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
-        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-        uint32_t basic_buffer[4];
-        uint32_t basic_buffer_swap[4];
-        if (big_endian) {
-            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-            surrogates = _mm_shuffle_epi8(surrogates, swap);
-        }
-        _mm_storeu_si128((__m128i*)basic_buffer, composed);
-        uint32_t surrogate_buffer[4];
-        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
-        for (size_t i = 0; i < 3; i++) {
-            if (basic_buffer[i] > 0x3c00000) {
-                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-                utf16_output += 2;
-            } else {
-                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-                utf16_output++;
-            }
-        }
-    } else {
-        // here we know that there is an error but we do not handle errors
-    }
-    return consumed;
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it may be
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  //
+  // We first try a few fast paths.
+  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  const __m128i in = _mm_loadu_si128((__m128i *)input);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+    // We process the data in chunks of 16 bytes.
+    __m128i ascii_first = _mm_cvtepu8_epi16(in);
+    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
+    if (big_endian) {
+      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
+      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
+    }
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 8; // We wrote 16 bytes, 8 code points.
+    return 16;
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4;
+    return 12;
+  }
+  /// We do not have a fast path available, so we fall back.
+
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // this is a relatively easy scenario
+    // we process SIX (6) input code-words. The max length in bytes of six code
+    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+    // where pdep/pext is fast, we might be able to use a small lookup table.
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 6; // We wrote 12 bytes, 6 code points.
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4;
+  } else if (idx < 209) {
+    // TWO (2) input code-words
+    //////////////
+    // There might be garbage inputs where a leading byte masquerades as a four-byte
+    // leading byte (by being followed by 3 continuation bytes), but is not greater than
+    // 0xf0. This could trigger a buffer overflow if we only counted leading
+    // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+    // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+    // We do so at the cost of an extra mask.
+    /////////////
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    // We deliberately carry the leading four bits in highbyte if they are present,
+    // we remove them later when computing hightenbits.
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    // When we need to generate a surrogate pair (leading byte > 0xF0), then
+    // the corresponding 32-bit value in 'composed' will be greater than
+    // (0xf0000000>>6), that is, > 0x3c00000. This can be used later to identify the
+    // location of the surrogate pairs.
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    const __m128i composedminus =
+        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+    const __m128i lowtenbits =
+        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+    // Notice the 0x3ff mask:
+    const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+    const __m128i lowtenbitsadd =
+        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+    const __m128i hightenbitsadd =
+        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+    __m128i surrogates =
+        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+    uint32_t basic_buffer[4];
+    uint32_t basic_buffer_swap[4];
+    if (big_endian) {
+      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+      surrogates = _mm_shuffle_epi8(surrogates, swap);
+    }
+    _mm_storeu_si128((__m128i *)basic_buffer, composed);
+    uint32_t surrogate_buffer[4];
+    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+    for (size_t i = 0; i < 3; i++) {
+      if(basic_buffer[i] > 0x3c00000) {
+        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+        utf16_output += 2;
+      } else {
+        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+        utf16_output++;
+      }
+    }
+  } else {
+    // here we know that there is an error but we do not handle errors
+  }
+  return consumed;
 }
 /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
 /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
+
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char* input,
-    uint64_t utf8_end_of_code_point_mask,
-    char32_t*& utf32_output)
-{
-    // we use an approach where we try to process up to 12 input bytes.
-    // Why 12 input bytes and not 16? Because we are concerned with the size of
-    // the lookup tables. Also 12 is nicely divisible by two and three.
-    //
-    //
-    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-    // beneficial to have fast paths that depend on branch prediction but have less latency.
-    // This results in more instructions but, potentially, also higher speeds.
-    //
-    // We first try a few fast paths.
-    const __m128i in = _mm_loadu_si128((__m128i*)input);
-    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-        // We process the data in chunks of 16 bytes.
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu8_epi32(in));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 8), _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 12), _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
-        utf32_output += 16; // We wrote 16 32-bit characters.
-        return 16; // We consumed 16 bytes.
-    }
-    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
-        utf32_output += 8; // We wrote 32 bytes, 8 code points.
-        return 16;
-    }
-    if (input_utf8_end_of_code_point_mask == 0x924) {
-        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-        // There is probably a more efficient sequence, but the following might do.
-        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 4;
-        return 12;
-    }
-    /// We do not have a fast path available, so we fallback.
-
-    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-    if (idx < 64) {
-        // SIX (6) input code-words
-        // this is a relatively easy scenario
-        // we process SIX (6) input code-words. The max length in bytes of six code
-        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-        // where pdep/pext is fast, we might be able to use a small lookup table.
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
-        utf32_output += 6; // We wrote 12 bytes, 6 code points.
-    } else if (idx < 145) {
-        // FOUR (4) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 4;
-    } else if (idx < 209) {
-        // TWO (2) input code-words
-        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
-        const __m128i perm = _mm_shuffle_epi8(in, sh);
-        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-        // correct for spurious high bit
-        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-        _mm_storeu_si128((__m128i*)utf32_output, composed);
-        utf32_output += 3;
-    } else {
-        // here we know that there is an error but we do not handle errors
-    }
-    return consumed;
+size_t convert_masked_utf8_to_utf32(const char *input,
+                           uint64_t utf8_end_of_code_point_mask,
+                           char32_t *&utf32_output) {
+  // We use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  // (The first two fast paths below are the exception: they consume 16 bytes.)
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it may be
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  //
+  // We first try a few fast paths.
+  const __m128i in = _mm_loadu_si128((__m128i *)input); // unaligned 16-byte load
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+    // Every one of the 16 bytes ends a code point: zero-extend each byte to 32 bits.
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
+    utf32_output += 16; // We wrote 16 32-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
+    return 16; // We consumed 16 bytes.
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+    // There is probably a more efficient sequence, but the following might do.
+    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+    return 12; // We consumed 12 bytes.
+  }
+  // We do not have a fast path available, so we fall back to the table-driven path.
+
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // this is a relatively easy scenario
+    // we process SIX (6) input code-words. The max length in bytes of six code
+    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+    // where pdep/pext is fast, we might be able to use a small lookup table.
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
+    utf32_output += 6; // We keep 6 code points (24 bytes); the two stores above wrote 32 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code-words, each assembled from up to 3 bytes
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+  } else if (idx < 209) {
+    // THREE (3) input code-words, each assembled from up to 4 bytes (output advances by 3 below)
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 3;
+  } else {
+    // here we know that there is an error but we do not handle errors;
+    // nothing is written, and the table-provided 'consumed' is still returned
+  }
+  return consumed; // byte count taken from utf8bigindex[...][1]
 /* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
 
@@ -28764,482 +27229,477 @@ size_t convert_masked_utf8_to_utf32(const char* input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
-{
+template <endianness big_endian>
+std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
 
-    const char16_t* end = buf + len;
+  const char16_t* end = buf + len;
 
-    const __m128i v_0000 = _mm_setzero_si128();
-    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-    while (buf + 16 + safety_margin <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
+  while (buf + 16 + safety_margin <= end) {
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    if (big_endian) {
+      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
+    }
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
         if (big_endian) {
-            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            in = _mm_shuffle_epi8(in, swap);
-        }
-        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                nextin = _mm_shuffle_epi8(nextin, swap);
-            }
-            if (!_mm_testz_si128(nextin, v_ff80)) {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in, in);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                in = nextin;
-            } else {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        // no bits set above 7th bit
-        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-        if (one_or_two_bytes_bitmask == 0xffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m128i t0 = _mm_slli_epi16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m128i t1 = _mm_and_si128(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m128i t2 = _mm_and_si128(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m128i t3 = _mm_or_si128(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x0000) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+          nextin = _mm_shuffle_epi8(nextin, swap);
+        }
+        if(!_mm_testz_si128(nextin, v_ff80)) {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          const __m128i utf8_packed = _mm_packus_epi16(in,in);
+          // 2. store (16 bytes)
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          in = nextin;
+        } else {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
+          // 2. store (16 bytes)
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 16;
+          utf8_output += 16;
+          continue; // we are done for this round!
+        }
+    }
+
+    // no bits set above 7th bit
+    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+          // expected output   : [110a|aaaa|10bb|bbbb] x 8
+          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const __m128i t0 = _mm_slli_epi16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const __m128i t1 = _mm_and_si128(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const __m128i t2 = _mm_and_si128(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const __m128i t3 = _mm_or_si128(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+          // 2. merge ASCII and 2-byte codewords
+          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+          // 3. prepare bitmask for 8-bit lookup
+          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
+          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
+          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
+          // 4. pack the bytes
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+          // 5. store bytes
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+          // 6. adjust pointers
+          buf += 8;
+          utf8_output += row[0];
+          continue;
+
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate word
+    //    at the end of a chunk.
+    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x0000) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m128i s0 = _mm_srli_epi16(in, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m128i s4 = _mm_xor_si128(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
+
+        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+        const __m128i s0 = _mm_srli_epi16(in, 4);
+        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+        const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+        const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+        const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-            if (mask == 0) {
-                // We only have three-byte words. Use fast path.
-                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
-                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-                utf8_output += 12;
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-                utf8_output += 12;
-                buf += 8;
-                continue;
-            }
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
-            // surrogate pair(s) in a register
+        // 4. expand words 16-bit => 32-bit
+        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16_t mask = (one_byte_bitmask & 0x5555) |
+                              (one_or_two_bytes_bitmask & 0xaaaa);
+        if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }
+        const uint8_t mask0 = uint8_t(mask);
+
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += row0[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(buf, utf8_output);
+  return std::make_pair(buf, utf8_output);
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
-{
-    const char16_t* start = buf;
-    const char16_t* end = buf + len;
-
-    const __m128i v_0000 = _mm_setzero_si128();
-    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
+template <endianness big_endian>
+std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
+  const char16_t* start = buf;
+  const char16_t* end = buf + len;
+
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    if (big_endian) {
+      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
+    }
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
         if (big_endian) {
-            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            in = _mm_shuffle_epi8(in, swap);
-        }
-        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                nextin = _mm_shuffle_epi8(nextin, swap);
-            }
-            if (!_mm_testz_si128(nextin, v_ff80)) {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in, in);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                in = nextin;
-            } else {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        // no bits set above 7th bit
-        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-        if (one_or_two_bytes_bitmask == 0xffff) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m128i t0 = _mm_slli_epi16(in, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m128i t1 = _mm_and_si128(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m128i t2 = _mm_and_si128(in, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m128i t3 = _mm_or_si128(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x0000) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+          nextin = _mm_shuffle_epi8(nextin, swap);
+        }
+        if(!_mm_testz_si128(nextin, v_ff80)) {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          const __m128i utf8_packed = _mm_packus_epi16(in,in);
+          // 2. store (16 bytes)
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 8;
+          utf8_output += 8;
+          in = nextin;
+        } else {
+          // 1. pack the bytes
+          // obviously suboptimal.
+          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
+          // 2. store (16 bytes)
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+          // 3. adjust pointers
+          buf += 16;
+          utf8_output += 16;
+          continue; // we are done for this round!
+        }
+    }
+
+    // no bits set above 7th bit
+    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+          // 1. prepare 2-byte values
+          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+          // expected output   : [110a|aaaa|10bb|bbbb] x 8
+          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+          // t0 = [000a|aaaa|bbbb|bb00]
+          const __m128i t0 = _mm_slli_epi16(in, 2);
+          // t1 = [000a|aaaa|0000|0000]
+          const __m128i t1 = _mm_and_si128(t0, v_1f00);
+          // t2 = [0000|0000|00bb|bbbb]
+          const __m128i t2 = _mm_and_si128(in, v_003f);
+          // t3 = [000a|aaaa|00bb|bbbb]
+          const __m128i t3 = _mm_or_si128(t1, t2);
+          // t4 = [110a|aaaa|10bb|bbbb]
+          const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+          // 2. merge ASCII and 2-byte codewords
+          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+          // 3. prepare bitmask for 8-bit lookup
+          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
+          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
+          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
+          // 4. pack the bytes
+          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+          // 5. store bytes
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+          // 6. adjust pointers
+          buf += 8;
+          utf8_output += row[0];
+          continue;
+
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate word
+    //    at the end of a chunk.
+    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x0000) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+        /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+          We expand the input word (16-bit) into two words (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+          either byte 1 for case #2 or byte 2 for case #3. Note that they
+          differ by exactly one bit.
+
+          Finally from these two words we build proper UTF-8 sequence, taking
+          into account the case (i.e., the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m128i s0 = _mm_srli_epi16(in, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m128i s4 = _mm_xor_si128(s3, m0);
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
+
+        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+        const __m128i s0 = _mm_srli_epi16(in, 4);
+        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+        const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+        const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+        const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-            if (mask == 0) {
-                // We only have three-byte words. Use fast path.
-                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
-                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-                utf8_output += 12;
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-                utf8_output += 12;
-                buf += 8;
-                continue;
-            }
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
-            // surrogate pair(s) in a register
+        // 4. expand words 16-bit => 32-bit
+        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16_t mask = (one_byte_bitmask & 0x5555) |
+                              (one_or_two_bytes_bitmask & 0xaaaa);
+        if(mask == 0) {
+          // We only have three-byte words. Use fast path.
+          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+          utf8_output += 12;
+          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }
+        const uint8_t mask0 = uint8_t(mask);
+
+        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += row0[0];
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xF800 ) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xF800) != 0xD800) {
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf8_output++ = char((value >> 18) | 0b11110000);
-                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
@@ -29297,816 +27757,781 @@ std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template<endianness big_endian>
-std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const char16_t* end = buf + len;
-
-    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-
-    while (buf + 16 <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
+template <endianness big_endian>
+std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
+  const char16_t* end = buf + len;
 
-        if (big_endian) {
-            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            in = _mm_shuffle_epi8(in, swap);
-        }
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+  while (buf + 16 <= end) {
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x0000) {
-            // case: no surrogate pair, extend 16-bit words to 32-bit words
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
-            utf32_output += 8;
-            buf += 8;
-            // surrogate pair(s) in a register
+    if (big_endian) {
+      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate word
+    //    at the end of a chunk.
+    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x0000) {
+      // case: no surrogate pair, extend 16-bit words to 32-bit words
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
+        utf32_output += 8;
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(nullptr, utf32_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(buf, utf32_output);
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, utf32_output);
 }
 
+
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template<endianness big_endian>
-std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
-{
-    const char16_t* start = buf;
-    const char16_t* end = buf + len;
+template <endianness big_endian>
+std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
+  const char16_t* start = buf;
+  const char16_t* end = buf + len;
 
-    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-    while (buf + 16 <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
+  while (buf + 16 <= end) {
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-        if (big_endian) {
-            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            in = _mm_shuffle_epi8(in, swap);
-        }
-
-        // 1. Check if there are any surrogate word in the input chunk.
-        //    We have also deal with situation when there is a surrogate word
-        //    at the end of a chunk.
-        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-        // bitmask = 0x0000 if there are no surrogates
-        //         = 0xc000 if the last word is a surrogate
-        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-        // it is likely an uncommon occurrence.
-        if (surrogates_bitmask == 0x0000) {
-            // case: no surrogate pair, extend 16-bit words to 32-bit words
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
-            utf32_output += 8;
-            buf += 8;
-            // surrogate pair(s) in a register
+    if (big_endian) {
+      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate word
+    //    at the end of a chunk.
+    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x0000) {
+      // case: no surrogate pair, extend 16-bit words to 32-bit words
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
+        utf32_output += 8;
+        buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if((word &0xF800 ) != 0xD800) {
+          *utf32_output++ = char32_t(word);
         } else {
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-                if ((word & 0xF800) != 0xD800) {
-                    *utf32_output++ = char32_t(word);
-                } else {
-                    // must be a surrogate pair
-                    uint16_t diff = uint16_t(word - 0xD800);
-                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-                    k++;
-                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
-                    if ((diff | diff2) > 0x3FF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
-                    }
-                    uint32_t value = (diff << 10) + diff2 + 0x10000;
-                    *utf32_output++ = char32_t(value);
-                }
-            }
-            buf += k;
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-    } // while
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
 /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
 
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
 /* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
-{
-    const char32_t* end = buf + len;
-
-    const __m128i v_0000 = _mm_setzero_si128(); //__m128 = 128 bits
-    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 0000
-    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 0000
-    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 0000
-    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
-    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
-    __m128i running_max = _mm_setzero_si128();
-    __m128i forbidden_bytemask = _mm_setzero_si128();
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) { // buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes
-        // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1); // These two values can hold only 8 UTF32 chars
-        running_max = _mm_max_epu32(
-            _mm_max_epu32(in, running_max), // take element-wise max char32_t from in and running_max vector
-            nextin); // and take element-wise max element from nextin and running_max vector
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m128i in_16 = _mm_packus_epi32(
-            _mm_and_si128(in, v_7fffffff),
-            _mm_and_si128(nextin, v_7fffffff)); // in this context pack the two __m128 into a single
-        // By ensuring the highest bit is set to 0(&v_7fffffff), we're making sure all values are interpreted as non-negative, or specifically, the values are within the range of valid Unicode code points.
-        // remember : having leading byte 0 means a positive number by the two complements system. Unicode is well beneath the range where you'll start getting issues so that's OK.
-
-        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-        // Check for ASCII fast path
-
-        // ASCII fast path!!!!
-        // We eagerly load another 32 bytes, hoping that they will be ASCII too.
-        // The intuition is that we try to collect 16 ASCII characters which requires
-        // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
-        // as our new inputs.
-        if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
-            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
-            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
-            running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin); // take the running max of all 4 vectors thus far
-            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); // pack into 1 vector, now you have two
-            if (!_mm_testz_si128(nextin_16, v_ff80)) { // checks if the second packed vector is ASCII, if not:
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); // creates two copy of in_16 in 1 vector
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); // put them into the output
-                // 3. adjust pointers
-                buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32 bits =  256 bits
-                utf8_output += 8; // same with output, e.g. lift the first two blocks alone.
-                // Proceed with next input
-                in_16 = nextin_16;
-                // We need to update in and nextin because they are used later.
-                in = thirdin;
-                nextin = fourthin;
-            } else {
-                // 1. pack the bytes
-                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        // no bits set above 7th bit -- find out all the ASCII characters
-        const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares:
-            _mm_and_si128(in_16, v_ff80), // the vector that get only the first 9 bits of each 16-bit/2-byte units
-            v_0000 //
-        ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is of format 0000 0000 0000 0XXX XXXX
-        // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and 0000 0000 0000 0000 if not for each 16-bit/2-byte units
-        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB from previous vector and put them into uint16_t mas
-
-        // no bits set above 11th bit
-        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-        if (one_or_two_bytes_bitmask == 0xffff) {
-            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
-            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
-            // t1 = [000a|aaaa|0000|0000]
-            const __m128i t1 = _mm_and_si128(t0, v_1f00); // potentital first utf8 byte
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m128i t2 = _mm_and_si128(in_16, v_003f); // potential second utf8 byte
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-
-        // Check for overflow in packing
-
-        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-        if (saturation_bitmask == 0xffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
-
-            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
+  const char32_t* end = buf + len;
+
+  const __m128i v_0000 = _mm_setzero_si128();//__m128 = 128 bits
+  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); //1111 1000 0000 0000
+  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); //1100 0000 1000 0000
+  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); //1111 1111 1000 0000
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); //1111 1111 1111 1111 0000 0000 0000 0000
+  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111 
+  __m128i running_max = _mm_setzero_si128();
+  __m128i forbidden_bytemask = _mm_setzero_si128();
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) { //buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes
+    // We load two 16-byte registers for a total of 32 bytes, i.e. 8 UTF-32 characters.
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);//These two values can hold only 8 UTF32 chars
+    running_max = _mm_max_epu32(
+                                _mm_max_epu32(in, running_max), //take element-wise max char32_t from in and running_max vector
+                                 nextin); //and take element-wise max element from nextin and running_max vector
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m128i in_16 = _mm_packus_epi32(
+                                      _mm_and_si128(in, v_7fffffff), 
+                                      _mm_and_si128(nextin, v_7fffffff)
+                                      );//in this context pack the two __m128 into a single 
+    // By clearing the highest bit (& v_7fffffff), we make sure every 32-bit lane is non-negative when _mm_packus_epi32 reads it as a signed integer.
+    // Remember: a leading 0 bit means a non-negative number in two's complement. Valid Unicode code points are far below 0x80000000, so the mask leaves them unchanged.
+
+    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+    // Check for ASCII fast path 
+
+    // ASCII fast path!!!!
+      // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+      // The intuition is that we try to collect 16 ASCII characters which requires
+      // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+      // as our new inputs.
+    if(_mm_testz_si128(in_16, v_ff80)) {  //if the first two blocks are ASCII
+      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
+      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
+      running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);//take the running max of all 4 vectors thus far
+      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));//pack into 1 vector, now you have two
+      if(!_mm_testz_si128(nextin_16, v_ff80)) {  //checks if the second packed vector is ASCII, if not:
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16); //creates two copy of in_16 in 1 vector
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); //put them into the output
+        // 3. adjust pointers
+        buf += 8; //the char32_t buffer pointer goes up 8 char32_t chars* 32 bits =  256 bits
+        utf8_output += 8; //same with output, e.g. lift the first two blocks alone.
+        // Proceed with next input
+        in_16 = nextin_16;
+        // We need to update in and nextin because they are used later.
+        in = thirdin;
+        nextin = fourthin;
+      } else {
+        // 1. pack the bytes
+        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+    }
+
+    // no bits set above 7th bit -- find out all the ASCII characters
+    const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares: 
+                                                      _mm_and_si128(in_16, v_ff80), // the vector that get only the first 9 bits of each 16-bit/2-byte units
+                                                       v_0000 //
+                                                       ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is of format 0000 0000 0000 0XXX XXXX
+    // _mm_cmpeq_epi16 now yields 1111 1111 1111 1111 for each 16-bit (2-byte) unit that compared equal, and 0000 0000 0000 0000 otherwise
+    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB from previous vector and put them into uint16_t mas
+
+    // no bits set above 11th bit
+    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
+      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
+      // t1 = [000a|aaaa|0000|0000]
+      const __m128i t1 = _mm_and_si128(t0, v_1f00); // potentital first utf8 byte
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte 
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together 
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit 
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
+      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
+      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
+      // 4. pack the bytes
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+
+    // Check for overflow in packing
+
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
+
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m128i s0 = _mm_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m128i s4 = _mm_xor_si128(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m128i s0 = _mm_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-            if (mask == 0) {
-                // We only have three-byte words. Use fast path.
-                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
-                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-                utf8_output += 12;
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-                utf8_output += 12;
-                buf += 8;
-                continue;
-            }
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
+      // 4. expand words 16-bit => 32-bit
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16_t mask = (one_byte_bitmask & 0x5555) |
+                            (one_or_two_bytes_bitmask & 0xaaaa);
+      if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += 12;
+        buf += 8;
+        continue;
+      }
+      const uint8_t mask0 = uint8_t(mask);
+
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+
+      buf += 8;
+    } else {
+      // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=> it will produce four UTF-8 bytes
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xFFFF0000 )==0) {
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) {
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
-        }
-    } // while
-
-    // check for invalid input
-    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-    if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf8_output);
-    }
-
-    return std::make_pair(buf, utf8_output);
-}
-
-std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
-{
-
-    const char32_t* end = buf + len;
-    const char32_t* start = buf;
-
-    const __m128i v_0000 = _mm_setzero_si128();
-    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-
-    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-    while (buf + 16 + safety_margin <= end) {
-        // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
-
-        // Check for too large input
-        __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
-        if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
-            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-
-        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-        __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
-
-        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-        // Check for ASCII fast path
-        if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-            // We eagerly load another 32 bytes, hoping that they will be ASCII too.
-            // The intuition is that we try to collect 16 ASCII characters which requires
-            // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
-            // as our new inputs.
-            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
-            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
-            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-            if (!_mm_testz_si128(nextin_16, v_ff80)) {
-                // 1. pack the bytes
-                // obviously suboptimal.
-                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 8;
-                utf8_output += 8;
-                // Proceed with next input
-                in_16 = nextin_16;
-                __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
-                if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
-                    return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-                }
-                // We need to update in and nextin because they are used later.
-                in = thirdin;
-                nextin = fourthin;
-            } else {
-                // 1. pack the bytes
-                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-                // 2. store (16 bytes)
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-                // 3. adjust pointers
-                buf += 16;
-                utf8_output += 16;
-                continue; // we are done for this round!
-            }
-        }
-
-        // no bits set above 7th bit
-        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-        // no bits set above 11th bit
-        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-        if (one_or_two_bytes_bitmask == 0xffff) {
-            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const __m128i t0 = _mm_slli_epi16(in_16, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const __m128i t1 = _mm_and_si128(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const __m128i t2 = _mm_and_si128(in_16, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const __m128i t3 = _mm_or_si128(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-            // 2. merge ASCII and 2-byte codewords
-            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-            // 3. prepare bitmask for 8-bit lookup
-            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-            // 5. store bytes
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
-        }
-
-        // Check for overflow in packing
-        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-        if (saturation_bitmask == 0xffff) {
-            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-            // Check for illegal surrogate words
-            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-            }
-
-            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-            /* In this branch we handle three cases:
-                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-              We expand the input word (16-bit) into two words (32-bit), thus
-              we have room for four bytes. However, we need five distinct bit
-              layouts. Note that the last byte in cases #2 and #3 is the same.
-
-              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-              in register t2.
-
-              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-              either byte 1 for case #2 or byte 2 for case #3. Note that they
-              differ by exactly one bit.
-
-              Finally from these two words we build proper UTF-8 sequence, taking
-              into account the case (i.e, the number of bytes to write).
-            */
-            /**
-             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-             * t2 => [0ccc|cccc] [10cc|cccc]
-             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-             */
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  // check for invalid input
+  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+  if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
+    return std::make_pair(nullptr, utf8_output);
+  }
+
+  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
+
+  return std::make_pair(buf, utf8_output);
+}
+
+
+std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
+
+  const char32_t* end = buf + len;
+  const char32_t* start = buf;
+
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
+  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
+  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin <= end) {
+    // We load two 16-byte registers for a total of 32 bytes, i.e. 8 UTF-32 characters.
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+
+    // Check for too large input
+    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
+    if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+    }
+
+    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
+
+    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+    // Check for ASCII fast path
+    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
+      // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+      // The intuition is that we try to collect 16 ASCII characters which requires
+      // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+      // as our new inputs.
+      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
+      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
+      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
+      if(!_mm_testz_si128(nextin_16, v_ff80)) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        // Proceed with next input
+        in_16 = nextin_16;
+        __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
+        if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
+        // We need to update in and nextin because they are used later.
+        in = thirdin;
+        nextin = fourthin;
+      } else {
+        // 1. pack the bytes
+        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+    }
+
+    // no bits set above 7th bit
+    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 11th bit
+    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m128i t0 = _mm_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m128i t1 = _mm_and_si128(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m128i t2 = _mm_and_si128(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m128i t3 = _mm_or_si128(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+      // 2. merge ASCII and 2-byte codewords
+      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
+      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
+      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
+      // 4. pack the bytes
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+
+
+    // Check for overflow in packing
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+    if (saturation_bitmask == 0xffff) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+      // Check for illegal surrogate words
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+      }
+
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
 #define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
-
-            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-            const __m128i s0 = _mm_srli_epi16(in_16, 4);
-            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
-            const __m128i s4 = _mm_xor_si128(s3, m0);
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m128i t2 = _mm_or_si128 (t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m128i s0 = _mm_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
-            // 4. expand words 16-bit => 32-bit
-            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-            if (mask == 0) {
-                // We only have three-byte words. Use fast path.
-                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
-                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-                utf8_output += 12;
-                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-                utf8_output += 12;
-                buf += 8;
-                continue;
-            }
-            const uint8_t mask0 = uint8_t(mask);
-
-            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-            utf8_output += row0[0];
-            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-            utf8_output += row1[0];
-
-            buf += 8;
+      // 4. expand words 16-bit => 32-bit
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16_t mask = (one_byte_bitmask & 0x5555) |
+                            (one_or_two_bytes_bitmask & 0xaaaa);
+      if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+        utf8_output += 12;
+        buf += 8;
+        continue;
+      }
+      const uint8_t mask0 = uint8_t(mask);
+
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+      utf8_output += row1[0];
+
+      buf += 8;
+    } else {
+      // case: at least one 32-bit word needs a surrogate pair in UTF-16 <=> it will produce four UTF-8 bytes
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFFFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xFFFFF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word &0xFFFF0000 )==0) {
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
-            // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-            // Let us do a scalar fallback.
-            // It may seem wasteful to use scalar code, but being efficient with SIMD
-            // in the presence of surrogate pairs may require non-trivial tables.
-            size_t forward = 15;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFFFF80) == 0) {
-                    *utf8_output++ = char(word);
-                } else if ((word & 0xFFFFF800) == 0) {
-                    *utf8_output++ = char((word >> 6) | 0b11000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else if ((word & 0xFFFF0000) == 0) {
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 12) | 0b11100000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                } else {
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
-                    }
-                    *utf8_output++ = char((word >> 18) | 0b11110000);
-                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
-                }
-            }
-            buf += k;
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
+          *utf8_output++ = char((word>>18) | 0b11110000);
+          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
         }
-    } // while
+      }
+      buf += k;
+    }
+  } // while
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
 /* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
-template<endianness big_endian>
-std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
+template <endianness big_endian>
+std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) { // UTF-32 -> UTF-16; returns (nullptr, output) on invalid input, else (first unconverted input position, output end)
 
-    const char32_t* end = buf + len;
+  const char32_t* end = buf + len;
 
-    const __m128i v_0000 = _mm_setzero_si128();
-    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
-    __m128i forbidden_bytemask = _mm_setzero_si128();
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+  __m128i forbidden_bytemask = _mm_setzero_si128(); // accumulates surrogate hits from the vector path; tested once after the loop
 
-    while (buf + 8 <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
-        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-        // Check if no bits set above 16th
-        if (saturation_bitmask == 0xffff) {
-            // Pack UTF-32 to UTF-16
-            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
-
-            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
-
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
+  while (buf + 8 <= end) { // each iteration reads 8 code points (two 128-bit loads); a shorter tail is left unconverted
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); // second group of four code points (buf[4..7])
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask)); // 0xffff iff all 8 code points have their upper 16 bits clear
 
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    // Check if no bits set above 16th
+    if (saturation_bitmask == 0xffff) {
+      // Pack UTF-32 to UTF-16
+      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); // (x & 0xf800) == 0xd800 <=> x in [0xD800, 0xDFFF]
+
+      if (big_endian) { // swap the two bytes of every 16-bit unit
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else { // at least one code point is above 0xFFFF: handle up to 7 code points with scalar code
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} // NOTE(review): the loop condition already guarantees end - buf >= 8, so this clamp looks unreachable here
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); } // surrogate code point: report failure
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); // byte-swapped when big_endian
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(nullptr, utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); } // value above 0x10FFFF: report failure
+          word -= 0x10000; // remaining offset: bits above the low 10 go to the high surrogate, the low 10 bits to the low surrogate
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k; // advance past the code points handled by the scalar loop
     }
+  }
 
-    // check for invalid input
-    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(nullptr, utf16_output);
-    }
+  // check for invalid input (surrogates flagged on the vector path above)
+  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
 
-    return std::make_pair(buf, utf16_output);
+  return std::make_pair(buf, utf16_output);
 }
 
-template<endianness big_endian>
-std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
-{
-    const char32_t* start = buf;
-    const char32_t* end = buf + len;
 
-    const __m128i v_0000 = _mm_setzero_si128();
-    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+template <endianness big_endian>
+std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) { // UTF-32 -> UTF-16 with error reporting: returns (result{code, input offset}, output end)
+  const char32_t* start = buf; // kept so that offsets in the returned result are relative to the original input
+  const char32_t* end = buf + len;
 
-    while (buf + 8 <= end) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
-        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-        // Check if no bits set above 16th
-        if (saturation_bitmask == 0xffff) {
-            // Pack UTF-32 to UTF-16
-            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
-
-            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
-            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-            }
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
 
-            if (big_endian) {
-                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-            }
+  while (buf + 8 <= end) { // each iteration reads 8 code points (two 128-bit loads); a shorter tail is left unconverted
+    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1); // second group of four code points (buf[4..7])
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask)); // 0xffff iff all 8 code points have their upper 16 bits clear
 
-            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-            utf16_output += 8;
-            buf += 8;
+    // Check if no bits set above 16th
+    if (saturation_bitmask == 0xffff) {
+      // Pack UTF-32 to UTF-16
+      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); // (x & 0xf800) == 0xd800 <=> x in [0xD800, 0xDFFF]
+      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); // offset is the start of this 8-code-point group, not the exact offending index
+      }
+
+      if (big_endian) { // swap the two bytes of every 16-bit unit
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+
+      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else { // at least one code point is above 0xFFFF: handle up to 7 code points with scalar code
+      size_t forward = 7;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} // NOTE(review): the loop condition already guarantees end - buf >= 8, so this clamp looks unreachable here
+      for(; k < forward; k++) {
+        uint32_t word = buf[k];
+        if((word & 0xFFFF0000)==0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); } // offset buf - start + k is the index of the surrogate itself
+          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word); // byte-swapped when big_endian
         } else {
-            size_t forward = 7;
-            size_t k = 0;
-            if (size_t(end - buf) < forward + 1) {
-                forward = size_t(end - buf - 1);
-            }
-            for (; k < forward; k++) {
-                uint32_t word = buf[k];
-                if ((word & 0xFFFF0000) == 0) {
-                    // will not generate a surrogate pair
-                    if (word >= 0xD800 && word <= 0xDFFF) {
-                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output);
-                    }
-                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-                } else {
-                    // will generate a surrogate pair
-                    if (word > 0x10FFFF) {
-                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output);
-                    }
-                    word -= 0x10000;
-                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-                    if (big_endian) {
-                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-                    }
-                    *utf16_output++ = char16_t(high_surrogate);
-                    *utf16_output++ = char16_t(low_surrogate);
-                }
-            }
-            buf += k;
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); } // value above 0x10FFFF, reported with its exact index
+          word -= 0x10000; // remaining offset: bits above the low 10 go to the high surrogate, the low 10 bits to the low surrogate
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
+      }
+      buf += k; // advance past the code points handled by the scalar loop
     }
+  }
 
-    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); // on success the count is the number of code points consumed
 }
 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
 
@@ -30124,103 +28549,85 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
-    simdutf_really_inline size_t block_index();
-    simdutf_really_inline bool has_full_block() const;
-    simdutf_really_inline const uint8_t* full_block() const;
-    /**
-     * Get the last block, padded with spaces.
-     *
-     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-     * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-     *
-     * @return the number of effective characters in the last block.
-     */
-    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
-    simdutf_really_inline void advance();
-
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); // starts reading at offset 0
+  simdutf_really_inline size_t block_index(); // current read offset (idx)
+  simdutf_really_inline bool has_full_block() const; // true while idx < lenminusstep
+  simdutf_really_inline const uint8_t *full_block() const; // pointer to the block at the current offset
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == idx (e.g. len == 0), in
+   * which case this function returns 0 without writing to dst. In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance(); // moves the read offset forward by STEP_SIZE
 private:
-    const uint8_t* buf;
-    const size_t len;
-    const size_t lenminusstep;
-    size_t idx;
+  const uint8_t *buf; // start of the input
+  const size_t len; // total input length in bytes
+  const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+  size_t idx; // current read offset into buf
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text_64(const uint8_t* text)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text_64(const uint8_t *text) { // debug helper: printable copy of sizeof(simd8x64<uint8_t>) bytes read from text
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // allocated once, never freed, and reused by every call; the malloc result is not checked
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // bytes below ' ' as int8_t (control bytes and bytes >= 0x80) become '_'
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf; // points at the shared static buffer: overwritten by the next call
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-    in.store(reinterpret_cast<uint8_t*>(buf));
-    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-        if (buf[i] < ' ') {
-            buf[i] = '_';
-        }
-    }
-    buf[sizeof(simd8x64<uint8_t>)] = '\0';
-    return buf;
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) { // debug helper: printable copy of the bytes stored from `in`
+  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // allocated once, never freed, and reused by every call; the malloc result is not checked
+  in.store(reinterpret_cast<uint8_t*>(buf));
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    if (buf[i] < ' ') { buf[i] = '_'; } // NOTE(review): compares plain char, so whether bytes >= 0x80 are replaced depends on the platform's char signedness
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf; // points at the shared static buffer: overwritten by the next call
 }
 
-simdutf_unused static char* format_mask(uint64_t mask)
-{
-    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
-    for (size_t i = 0; i < 64; i++) {
-        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-    }
-    buf[64] = '\0';
-    return buf;
+simdutf_unused static char * format_mask(uint64_t mask) { // debug helper: renders the 64 mask bits as 'X' (set) or ' ' (clear), bit 0 first
+  static char *buf = reinterpret_cast<char*>(malloc(64 + 1)); // allocated once, never freed, and reused by every call; the malloc result is not checked
+  for (size_t i=0; i<64; i++) {
+    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; // NOTE(review): size_t(1) << i is only well-defined for all i < 64 when size_t is 64 bits wide
+  }
+  buf[64] = '\0';
+  return buf; // points at the shared static buffer: overwritten by the next call
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
-    : buf { _buf }
-    , len { _len }
-    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
-    , idx { 0 }
-{
-}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} // lenminusstep is clamped to 0 so the unsigned subtraction cannot wrap when len < STEP_SIZE
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
-{
-    return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const { // reports whether another block should be read through full_block()
+  return idx < lenminusstep; // strict '<': when exactly STEP_SIZE bytes remain this is false, leaving them to get_remainder()
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
-{
-    return &buf[idx];
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const { // returns the address of the block at the current offset
+  return &buf[idx]; // no bounds check here; presumably callers test has_full_block() first — confirm at call sites
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
-{
-    if (len == idx) {
-        return 0;
-    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-    std::memcpy(dst, buf + idx, len - idx);
-    return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const { // copies the unread tail into dst, space-padded; dst needs room for STEP_SIZE bytes
+  if(len == idx) { return 0; } // nothing left: dst is not written. (memcpy(dst, null, 0) will trigger an error with some sanitizers)
+  std::memset(dst, 0x20, STEP_SIZE); // pad the whole block with spaces first: writing a fixed STEP_SIZE is more efficient than a variable-length fill
+  std::memcpy(dst, buf + idx, len - idx); // assumes len - idx <= STEP_SIZE, i.e. called only once has_full_block() is false — TODO confirm at call sites
+  return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
-{
-    idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() { // moves the read offset to the next block
+  idx += STEP_SIZE; // unconditional: idx is not checked against len here
 }
 
 } // unnamed namespace
@@ -30236,22 +28643,21 @@ namespace utf8_validation {
 
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -30259,92 +28665,101 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) { // returns the 0x80 bit of must_be_2_3_continuation(prev2, prev3) XORed with sc
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the block:
-// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
-{
+  //
+  // Return nonzero if there are incomplete multibyte characters at the end of the block:
+  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+  //
+  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) { // compares input against per-position maxima via gt_bits
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 255, 255, 255,
-        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 // only the last three positions have a maximum below 255
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]); // take the final sizeof(simd8<uint8_t>) entries so the three limits line up with the block's last three bytes
     return input.gt_bits(max_value);
-}
+  }
 
-struct utf8_checker {
+  struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -30355,54 +28770,51 @@ struct utf8_checker {
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof()
-    {
-        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-        // possibly finish them.
-        this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof() {
+      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+      // possibly finish them.
+      this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
-    {
-        if (simdutf_likely(is_ascii(input))) {
-            this->error |= this->prev_incomplete;
-        } else {
-            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                "We support either two or four chunks per 64-byte block.");
-            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-            }
-            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
+      if(simdutf_likely(is_ascii(input))) {
+        this->error |= this->prev_incomplete;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
+        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+
+      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -30422,16 +28834,15 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t* input, size_t length)
-{
-    checker c {};
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -30440,106 +28851,97 @@ bool generic_validate_utf8(const uint8_t* input, size_t length)
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char* input, size_t length)
-{
-    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_utf8(const char * input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
-{
-    checker c {};
+result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
+    checker c{};
     buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
+    size_t count{0};
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        c.check_next_input(in);
-        if (c.errors()) {
-            if (count != 0) {
-                count--;
-            } // Sometimes the error is only detected in the next chunk
-            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            res.count += count;
-            return res;
-        }
-        reader.advance();
-        count += 64;
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      c.check_next_input(in);
+      if(c.errors()) {
+        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        res.count += count;
+        return res;
+      }
+      reader.advance();
+      count += 64;
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-        res.count += count;
-        return res;
+      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+      res.count += count;
+      return res;
     } else {
-        return result(error_code::SUCCESS, length);
+      return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char* input, size_t length)
-{
-    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_utf8_with_errors(const char * input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t* input, size_t length)
-{
+bool generic_validate_ascii(const uint8_t * input, size_t length) {
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64] {};
+    uint8_t blocks[64]{};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        running_or |= in;
-        reader.advance();
+      simd::simd8x64<uint8_t> in(reader.full_block());
+      running_or |= in;
+      reader.advance();
     }
-    uint8_t block[64] {};
+    uint8_t block[64]{};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char* input, size_t length)
-{
-    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+bool generic_validate_ascii(const char * input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
-{
-    buf_block_reader<64> reader(input, length);
-    size_t count { 0 };
-    while (reader.has_full_block()) {
-        simd::simd8x64<uint8_t> in(reader.full_block());
-        if (!in.is_ascii()) {
-            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-            return result(res.error, count + res.count);
-        }
-        reader.advance();
-
-        count += 64;
-    }
-    uint8_t block[64] {};
-    reader.get_remainder(block);
-    simd::simd8x64<uint8_t> in(block);
+result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
     if (!in.is_ascii()) {
-        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        return result(res.error, count + res.count);
-    } else {
-        return result(error_code::SUCCESS, length);
+      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+      return result(res.error, count + res.count);
     }
+    reader.advance();
+
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
 }
 
-result generic_validate_ascii_with_errors(const char* input, size_t length)
-{
-    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
+result generic_validate_ascii_with_errors(const char * input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
 }
 
 } // namespace utf8_validation
@@ -30551,6 +28953,7 @@ result generic_validate_ascii_with_errors(const char* input, size_t length)
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace westmere {
 namespace {
@@ -30558,64 +28961,63 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template<endianness endian>
+template <endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept
-{
-    // The implementation is not specific to haswell and should be moved to the generic directory.
-    size_t pos = 0;
-    char16_t* start { utf16_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        // this loop could be unrolled further. For example, we could process the mask
-        // far more than 64 bytes.
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf16<endian>(utf16_output);
-            utf16_output += 64;
-            pos += 64;
-        } else {
-            // Slow path. We hope that the compiler will recognize that this is a slow path.
-            // Anything that is not a continuation mask is a 'leading byte', that is, the
-            // start of a new code point.
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            // The *start* of code points is not so useful, rather, we want the *end* of code points.
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            // We process in blocks of up to 12 bytes except possibly
-            // for fast paths which may process up to 16 bytes. For the
-            // slow path to work, we should have at least 12 input bytes left.
-            size_t max_starting_point = (pos + 64) - 12;
-            // Next loop is going to run at least five times when using solely
-            // the slow/regular path, and at least four times if there are fast paths.
-            while (pos < max_starting_point) {
-                // Performance note: our ability to compute 'consumed' and
-                // then shift and recompute is critical. If there is a
-                // latency of, say, 4 cycles on getting 'consumed', then
-                // the inner loop might have a total latency of about 6 cycles.
-                // Yet we process between 6 to 12 inputs bytes, thus we get
-                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                // for this section of the code. Hence, there is a limit
-                // to how much we can further increase this latency before
-                // it seriously harms performance.
-                //
-                // Thus we may allow convert_masked_utf8_to_utf16 to process
-                // more bytes at a time under a fast-path mode where 16 bytes
-                // are consumed at once (e.g., when encountering ASCII).
-                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                    utf8_end_of_code_point_mask, utf16_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-            // At this point there may remain between 0 and 12 bytes in the
-            // 64-byte block. These bytes will be processed again. So we have an
-            // 80% efficiency (in the worst case). In practice we expect an
-            // 85% to 90% efficiency.
-        }
-    }
-    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-    return utf16_output - start;
+    char16_t* utf16_output) noexcept {
+  // The implementation is not specific to haswell and should be moved to the generic directory.
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the mask
+    // far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow path.
+      // Anything that is not a continuation mask is a 'leading byte', that is, the
+      // start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end* of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while(pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 to 12 inputs bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -30626,28 +29028,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
+
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -30655,274 +29058,275 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
+
 
-struct validating_transcoder {
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    template<endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf16_output += howmany;
-        }
-        return utf16_output - start;
-    }
-
-    template<endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
-    {
-        size_t pos = 0;
-        char16_t* start { utf16_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 8; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the eight last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf16<endian>(utf16_output);
-                utf16_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                        utf8_end_of_code_point_mask, utf16_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+    template <endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+        if(howmany == 0) { return 0; }
+        utf16_output += howmany;
+      }
+      return utf16_output - start;
+    }
+
+    template <endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
+      size_t pos = 0;
+      char16_t* start{utf16_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 8; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then in[margin] is the eighth-to-last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf16<endian>(utf16_output);
+          utf16_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 and 12 input bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                            utf8_end_of_code_point_mask, utf16_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of words written
+          utf16_output += res.count;
         }
-        if (pos < size) {
-            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf16_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf16_output - start);
+      }
+      return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace westmere
@@ -30939,36 +29343,37 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
+
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept
-{
-    size_t pos = 0;
-    char32_t* start { utf32_output };
-    const size_t safety_margin = 16; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
-        if (in.is_ascii()) {
-            in.store_ascii_as_utf32(utf32_output);
-            utf32_output += 64;
-            pos += 64;
-        } else {
-            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-            size_t max_starting_point = (pos + 64) - 12;
-            while (pos < max_starting_point) {
-                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                    utf8_end_of_code_point_mask, utf32_output);
-                pos += consumed;
-                utf8_end_of_code_point_mask >>= consumed;
-            }
-        }
+    char32_t* utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t* start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while(pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if(in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+    // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+    size_t max_starting_point = (pos + 64) - 12;
+    while(pos < max_starting_point) {
+      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                          utf8_end_of_code_point_mask, utf32_output);
+      pos += consumed;
+      utf8_end_of_code_point_mask >>= consumed;
+      }
     }
-    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-    return utf32_output - start;
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+  return utf32_output - start;
 }
 
+
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace westmere
@@ -30977,28 +29382,29 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
+
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
-{
-    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-    // Bit 1 = Too Long (ASCII followed by continuation)
-    // Bit 2 = Overlong 3-byte
-    // Bit 4 = Surrogate
-    // Bit 5 = Overlong 2-byte
-    // Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
+
+  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -31006,266 +29412,268 @@ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> in
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-    // 11110101 1000____
-    // 1111011_ 1000____
-    // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+                                                // 11110101 1000____
+                                                // 1111011_ 1000____
+                                                // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-        // 0_______ ________ <ASCII in byte 1>
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-        // 10______ ________ <continuation in byte 1>
-        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-        // 1100____ ________ <two byte lead in byte 1>
-        TOO_SHORT | OVERLONG_2,
-        // 1101____ ________ <two byte lead in byte 1>
-        TOO_SHORT,
-        // 1110____ ________ <three byte lead in byte 1>
-        TOO_SHORT | OVERLONG_3 | SURROGATE,
-        // 1111____ ________ <four+ byte lead in byte 1>
-        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+    );
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-        // ____0000 ________
-        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-        // ____0001 ________
-        CARRY | OVERLONG_2,
-        // ____001_ ________
-        CARRY, CARRY,
-
-        // ____0100 ________
-        CARRY | TOO_LARGE,
-        // ____0101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____011_ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-        // ____1___ ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
-        // ____1101 ________
-        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
+      // ____0000 ________
+      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+      // ____0001 ________
+      CARRY | OVERLONG_2,
+      // ____001_ ________
+      CARRY,
+      CARRY,
+
+      // ____0100 ________
+      CARRY | TOO_LARGE,
+      // ____0101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____011_ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+      // ____1___ ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // ____1101 ________
+      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+      CARRY | TOO_LARGE | TOO_LARGE_1000,
+      CARRY | TOO_LARGE | TOO_LARGE_1000
+    );
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-        // ________ 0_______ <ASCII in byte 2>
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-        // ________ 1000____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-        // ________ 1001____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-        // ________ 101_____
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-
-        // ________ 11______
-        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+    );
     return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
-{
+  }
+  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-}
+  }
 
-struct validating_transcoder {
+
+  struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder()
-        : error(uint8_t(0))
-    {
-    }
+    validating_transcoder() : error(uint8_t(0)) {}
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
-    {
-        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-        simd8<uint8_t> sc = check_special_cases(input, prev1);
-        this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
-            return 0;
-        }
-        if (pos < size) {
-            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-            if (howmany == 0) {
-                return 0;
-            }
-            utf32_output += howmany;
-        }
-        return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
-    {
-        size_t pos = 0;
-        char32_t* start { utf32_output };
-        // In the worst case, we have the haswell kernel which can cause an overflow of
-        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
-        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
-        // much more than 8 bytes. However, you cannot generally assume that you have valid
-        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
-        // to give us a good margin.
-        size_t leading_byte = 0;
-        size_t margin = size;
-        for (; margin > 0 && leading_byte < 4; margin--) {
-            leading_byte += (int8_t(in[margin - 1]) > -65);
-        }
-        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
-        const size_t safety_margin = size - margin + 1; // to avoid overruns!
-        while (pos + 64 + safety_margin <= size) {
-            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-            if (input.is_ascii()) {
-                input.store_ascii_as_utf32(utf32_output);
-                utf32_output += 64;
-                pos += 64;
-            } else {
-                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-                auto zero = simd8<uint8_t> { uint8_t(0) };
-                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-                    this->check_utf8_bytes(input.chunks[0], zero);
-                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-                }
-                if (errors()) {
-                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-                    res.count += pos;
-                    return res;
-                }
-                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-                // We process in blocks of up to 12 bytes except possibly
-                // for fast paths which may process up to 16 bytes. For the
-                // slow path to work, we should have at least 12 input bytes left.
-                size_t max_starting_point = (pos + 64) - 12;
-                // Next loop is going to run at least five times.
-                while (pos < max_starting_point) {
-                    // Performance note: our ability to compute 'consumed' and
-                    // then shift and recompute is critical. If there is a
-                    // latency of, say, 4 cycles on getting 'consumed', then
-                    // the inner loop might have a total latency of about 6 cycles.
-                    // Yet we process between 6 to 12 inputs bytes, thus we get
-                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-                    // for this section of the code. Hence, there is a limit
-                    // to how much we can further increase this latency before
-                    // it seriously harms performance.
-                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                        utf8_end_of_code_point_mask, utf32_output);
-                    pos += consumed;
-                    utf8_end_of_code_point_mask >>= consumed;
-                }
-                // At this point there may remain between 0 and 12 bytes in the
-                // 64-byte block. These bytes will be processed again. So we have an
-                // 80% efficiency (in the worst case). In practice we expect an
-                // 85% to 90% efficiency.
-            }
-        }
-        if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
+      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+      simd8<uint8_t> sc = check_special_cases(input, prev1);
+      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 to 12 inputs bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) { return 0; }
+      if(pos < size) {
+        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+        if(howmany == 0) { return 0; }
+        utf32_output += howmany;
+      }
+      return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
+      size_t pos = 0;
+      char32_t* start{utf32_output};
+      // In the worst case, we have the haswell kernel which can cause an overflow of
+      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+      // much more than 8 bytes. However, you cannot generally assume that you have valid
+      // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+      // to give us a good margin.
+      size_t leading_byte = 0;
+      size_t margin = size;
+      for(; margin > 0 && leading_byte < 4; margin--) {
+        leading_byte += (int8_t(in[margin-1]) > -65);
+      }
+      // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+      const size_t safety_margin = size - margin + 1; // to avoid overruns!
+      while(pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+        if(input.is_ascii()) {
+          input.store_ascii_as_utf32(utf32_output);
+          utf32_output += 64;
+          pos += 64;
+        } else {
+          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+              "We support either two or four chunks per 64-byte block.");
+          auto zero = simd8<uint8_t>{uint8_t(0)};
+          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+            this->check_utf8_bytes(input.chunks[0], zero);
+            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+          }
+          if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
+          }
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+          // We process in blocks of up to 12 bytes except possibly
+          // for fast paths which may process up to 16 bytes. For the
+          // slow path to work, we should have at least 12 input bytes left.
+          size_t max_starting_point = (pos + 64) - 12;
+          // Next loop is going to run at least five times.
+          while(pos < max_starting_point) {
+            // Performance note: our ability to compute 'consumed' and
+            // then shift and recompute is critical. If there is a
+            // latency of, say, 4 cycles on getting 'consumed', then
+            // the inner loop might have a total latency of about 6 cycles.
+            // Yet we process between 6 to 12 inputs bytes, thus we get
+            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+            // for this section of the code. Hence, there is a limit
+            // to how much we can further increase this latency before
+            // it seriously harms performance.
+            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                            utf8_end_of_code_point_mask, utf32_output);
+            pos += consumed;
+            utf8_end_of_code_point_mask >>= consumed;
+          }
+          // At this point there may remain between 0 and 12 bytes in the
+          // 64-byte block. These bytes will be processed again. So we have an
+          // 80% efficiency (in the worst case). In practice we expect an
+          // 85% to 90% efficiency.
+        }
+      }
+      if(errors()) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        res.count += pos;
+        return res;
+      }
+      if(pos < size) {
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+        if (res.error) {    // In case of error, we want the error position
+          res.count += pos;
+          return res;
+        } else {    // In case of success, we want the number of word written
+          utf32_output += res.count;
         }
-        if (pos < size) {
-            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-            if (res.error) { // In case of error, we want the error position
-                res.count += pos;
-                return res;
-            } else { // In case of success, we want the number of word written
-                utf32_output += res.count;
-            }
-        }
-        return result(error_code::SUCCESS, utf32_output - start);
+      }
+      return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const
-    {
-        return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const {
+      return this->error.any_bits_set_anywhere();
     }
 
-}; // struct utf8_checker
+  }; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace westmere
@@ -31282,37 +29690,36 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size)
-{
+simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        count += 64 - count_ones(utf8_continuation_mask);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 64 <= size; pos += 64) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        // We count one word for anything that is not a continuation (so
-        // leading bytes).
-        count += 64 - count_ones(utf8_continuation_mask);
-        int64_t utf8_4byte = input.gteq_unsigned(240);
-        count += count_ones(utf8_4byte);
+    for(;pos + 64 <= size; pos += 64) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+      // We count one word for anything that is not a continuation (so
+      // leading bytes).
+      count += 64 - count_ones(utf8_continuation_mask);
+      int64_t utf8_4byte = input.gteq_unsigned(240);
+      count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
-{
+
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -31327,65 +29734,57 @@ namespace westmere {
 namespace {
 namespace utf16 {
 
-template<endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-        count += count_ones(not_pair) / 2;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+      count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for (; pos + 32 <= size; pos += 32) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        if (!match_system(big_endian)) {
-            input.swap_bytes();
-        }
-        uint64_t ascii_mask = input.lteq(0x7F);
-        uint64_t twobyte_mask = input.lteq(0x7FF);
-        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-        size_t ascii_count = count_ones(ascii_mask) / 2;
-        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for(;pos + 32 <= size; pos += 32) {
+      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+      if (!match_system(big_endian)) { input.swap_bytes(); }
+      uint64_t ascii_mask = input.lteq(0x7F);
+      uint64_t twobyte_mask = input.lteq(0x7FF);
+      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+      size_t ascii_count = count_ones(ascii_mask) / 2;
+      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
+      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
+      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template<endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
-{
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
-{
-    size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
+  size_t pos = 0;
 
-    while (pos + 32 <= size) {
-        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
-        input.swap_bytes();
-        input.store(reinterpret_cast<uint16_t*>(output));
-        pos += 32;
-        output += 32;
-    }
+  while (pos + 32 <= size) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
 
-    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -31400,661 +29799,592 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace westmere {
 
-simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
-{
-    // If there is a BOM, then we trust it.
-    auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if (bom_encoding != encoding_type::unspecified) {
-        return bom_encoding;
-    }
-    if (length % 2 == 0) {
-        return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
+simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  if (length % 2 == 0) {
+    return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
+  } else {
+    if (implementation::validate_utf8(input, length)) {
+      return simdutf::encoding_type::UTF8;
     } else {
-        if (implementation::validate_utf8(input, length)) {
-            return simdutf::encoding_type::UTF8;
-        } else {
-            return simdutf::encoding_type::unspecified;
-        }
+      return simdutf::encoding_type::unspecified;
     }
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
-{
-    return westmere::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return westmere::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
-{
-    return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
-{
-    return westmere::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return westmere::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
-{
-    return westmere::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+  return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
-{
-    const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
-    if (tail) {
-        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
-{
-    result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
-{
-    const char32_t* tail = sse_validate_utf32le(buf, len);
-    if (tail) {
-        return scalar::utf32::validate(tail, len - (tail - buf));
-    } else {
-        return false;
-    }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t* tail = sse_validate_utf32le(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
-{
-    result res = sse_validate_utf32le_with_errors(buf, len);
-    if (res.count != len) {
-        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-        return result(scalar_res.error, res.count + scalar_res.count);
-    } else {
-        return res;
-    }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  result res = sse_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
-{
-    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
+  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
-{
-    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    utf8_to_utf16::validating_transcoder converter;
-    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
+
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept
-{
-    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
+    char16_t* utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    utf8_to_utf32::validating_transcoder converter;
-    return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept
-{
-    return utf8_to_utf32::convert_valid(input, size, utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+    char32_t* utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
-{
-    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf8_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
-    return ret.first;
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf32_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-    if (ret.first.error) {
-        return ret.first;
-    } // Can return directly since scalar fallback already found correct ret.first.count
-    if (ret.first.count != len) { // All good so far, but not finished
-        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf32_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
-{
-    return convert_utf32_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_with_errors(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept {
+  return scalar::utf32_to_latin1::convert_valid(buf,len,latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first == nullptr) {
-        return 0;
-    }
-    size_t saved_bytes = ret.second - utf16_output;
-    if (ret.first != buf + len) {
-        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-        if (scalar_saved_bytes == 0) {
-            return 0;
-        }
-        saved_bytes += scalar_saved_bytes;
-    }
-    return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
-    }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    // ret.first.count is always the position in the buffer, not the number of words written even if finished
-    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-    if (ret.first.count != len) {
-        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-        if (scalar_res.error) {
-            scalar_res.count += ret.first.count;
-            return scalar_res;
-        } else {
-            ret.second += scalar_res.count;
-        }
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf32_output;   // Set count to the number of 32-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) { return 0; }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+                                        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { return 0; }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of words written even if finished
+  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+                                        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
-    return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16le(buf, len, utf16_output);
+  }
+  ret.first.count = ret.second - utf16_output;   // Set count to the number of 16-bit words written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
-{
-    return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
-{
-    return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
-{
-    utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::LITTLE>(input, length);
+void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
-{
-    return scalar::utf8::latin1_length_from_utf8(buf, len);
+simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
-{
-    return scalar::utf16::latin1_length_from_utf16(length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept {
+  return scalar::utf8::latin1_length_from_utf8(buf,len);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
-{
-    return scalar::utf32::latin1_length_from_utf32(length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32( size_t length) const noexcept {
+  return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
-{
-    return scalar::latin1::utf32_length_from_latin1(length);
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
-{
-    return scalar::latin1::utf8_length_from_latin1(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t len) const noexcept {
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
+  size_t i = 0;
+  __m128i two_64bits = _mm_setzero_si128();
+  while (i + sizeof(__m128i) <= len) {
+    __m128i runner = _mm_setzero_si128();
+    size_t iterations = (len - i) / sizeof(__m128i);
+    if (iterations > 255) {
+      iterations = 255;
+    }
+    size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
+    for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) {
+      __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
+      __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
+      __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i)));
+      __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
+      __m128i input12 = _mm_add_epi8(
+                                      _mm_cmpgt_epi8(
+                                                    _mm_setzero_si128(), 
+                                                    input1),
+                                      _mm_cmpgt_epi8(
+                                                    _mm_setzero_si128(),
+                                                    input2));
+      __m128i input34 = _mm_add_epi8(
+                                      _mm_cmpgt_epi8(
+                                                    _mm_setzero_si128(),
+                                                    input3),
+                                      _mm_cmpgt_epi8(
+                                                    _mm_setzero_si128(),
+                                                    input4));
+      __m128i input1234 = _mm_add_epi8(input12, input34);
+      runner = _mm_sub_epi8(runner, input1234);
+    }
+    for (; i <= max_i; i += sizeof(__m128i)) {
+      __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
+      runner = _mm_sub_epi8(
+          runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
+    }
+    two_64bits = _mm_add_epi64(
+        two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
+  }
+  answer += _mm_extract_epi64(two_64bits, 0) +
+            _mm_extract_epi64(two_64bits, 1);
+  return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), len - i);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
-{
-    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const __m128i v_00000000 = _mm_setzero_si128();
-    const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
-    const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
-    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 4 <= length; pos += 4) {
-        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-        const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
-        const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
-        const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
-        const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-        const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-        const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
-        const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
-        const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
-
-        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
-    }
-    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
-}
-
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
-{
-    const __m128i v_00000000 = _mm_setzero_si128();
-    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-    size_t pos = 0;
-    size_t count = 0;
-    for (; pos + 4 <= length; pos += 4) {
-        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-        const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-        const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
-        size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4;
-        count += 4 + surrogate_count;
-    }
-    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
-{
-    return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const __m128i v_00000000 = _mm_setzero_si128();
+  const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
+  const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 4 <= length; pos += 4) {
+    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+    const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
+    const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
+    const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+    const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+    const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
+    const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
+    const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
+
+    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
+    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
+  }
+  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
+  const __m128i v_00000000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for(;pos + 4 <= length; pos += 4) {
+    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+    const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+    const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
+    size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4;
+    count += 4 + surrogate_count;
+  }
+  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace westmere
diff --git a/src/bun.js/bindings/simdutf.h b/src/bun.js/bindings/simdutf.h
index 4c04ae467..7ad28528d 100644
--- a/src/bun.js/bindings/simdutf.h
+++ b/src/bun.js/bindings/simdutf.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+/* auto-generated on 2023-08-08 16:23:39 -0400. Do not edit! */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf.h
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
@@ -78,7 +78,7 @@
 #include <machine/endian.h>
 #elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__)
 #include <sys/byteorder.h>
-#else // defined(__APPLE__) || defined(__FreeBSD__)
+#else  // defined(__APPLE__) || defined(__FreeBSD__)
 
 #ifdef __has_include
 #if __has_include(<endian.h>)
@@ -88,6 +88,7 @@
 
 #endif // defined(__APPLE__) || defined(__FreeBSD__)
 
+
 #ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
 #define SIMDUTF_IS_BIG_ENDIAN 0
 #endif
@@ -100,6 +101,7 @@
 
 #endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
 
+
 /**
  * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined.
  */
@@ -137,9 +139,9 @@
 #elif defined(__aarch64__) || defined(_M_ARM64)
 #define SIMDUTF_IS_ARM64 1
 #elif defined(__PPC64__) || defined(_M_PPC64)
-// #define SIMDUTF_IS_PPC64 1
-//  The simdutf library does yet support SIMD acceleration under
-//  POWER processors. Please see https://github.com/lemire/simdutf/issues/51
+//#define SIMDUTF_IS_PPC64 1
+// The simdutf library does not yet support SIMD acceleration under
+// POWER processors. Please see https://github.com/lemire/simdutf/issues/51
 #elif defined(__s390__)
 // s390 IBM system. Big endian.
 #elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
@@ -198,14 +200,14 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 // warning: clang attribute push can't be used within a namespace in clang up
 // til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a
 // namespace.
-#define SIMDUTF_TARGET_REGION(T) \
-    _Pragma(SIMDUTF_STRINGIFY(   \
-        clang attribute push(__attribute__((target(T))), apply_to = function)))
+#define SIMDUTF_TARGET_REGION(T)                                                       \
+  _Pragma(SIMDUTF_STRINGIFY(                                                           \
+      clang attribute push(__attribute__((target(T))), apply_to = function)))
 #define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop")
 #elif defined(__GNUC__)
 // GCC is easier
-#define SIMDUTF_TARGET_REGION(T) \
-    _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
+#define SIMDUTF_TARGET_REGION(T)                                                       \
+  _Pragma("GCC push_options") _Pragma(SIMDUTF_STRINGIFY(GCC target(T)))
 #define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options")
 #endif // clang then gcc
 
@@ -258,11 +260,7 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 #define SIMDUTF_ASSUME(COND) __assume(COND)
 #else
 #define SIMDUTF_UNREACHABLE() __builtin_unreachable();
-#define SIMDUTF_ASSUME(COND)         \
-    do {                             \
-        if (!(COND))                 \
-            __builtin_unreachable(); \
-    } while (0)
+#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
 #endif
 
 #else // NDEBUG
@@ -272,12 +270,14 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif
 
+
 #if defined(__GNUC__) && !defined(__clang__)
 #if __GNUC__ >= 11
 #define SIMDUTF_GCC11ORMORE 1
 #endif //  __GNUC__ >= 11
 #endif // defined(__GNUC__) && !defined(__clang__)
 
+
 #endif // SIMDUTF_PORTABILITY_H
 /* end file include/simdutf/portability.h */
 // dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/avx512.h
@@ -295,86 +295,84 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 */
 
 #ifndef SIMDUTF_HAS_AVX512F
-#if defined(__AVX512F__) && __AVX512F__ == 1
-#define SIMDUTF_HAS_AVX512F 1
-#endif
+# if defined(__AVX512F__) && __AVX512F__ == 1
+#   define SIMDUTF_HAS_AVX512F 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512DQ
-#if defined(__AVX512DQ__) && __AVX512DQ__ == 1
-#define SIMDUTF_HAS_AVX512DQ 1
-#endif
+# if defined(__AVX512DQ__) && __AVX512DQ__ == 1
+#   define SIMDUTF_HAS_AVX512DQ 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512IFMA
-#if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
-#define SIMDUTF_HAS_AVX512IFMA 1
-#endif
+# if defined(__AVX512IFMA__) && __AVX512IFMA__ == 1
+#   define SIMDUTF_HAS_AVX512IFMA 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512CD
-#if defined(__AVX512CD__) && __AVX512CD__ == 1
-#define SIMDUTF_HAS_AVX512CD 1
-#endif
+# if defined(__AVX512CD__) && __AVX512CD__ == 1
+#   define SIMDUTF_HAS_AVX512CD 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512BW
-#if defined(__AVX512BW__) && __AVX512BW__ == 1
-#define SIMDUTF_HAS_AVX512BW 1
-#endif
+# if defined(__AVX512BW__) && __AVX512BW__ == 1
+#   define SIMDUTF_HAS_AVX512BW 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512VL
-#if defined(__AVX512VL__) && __AVX512VL__ == 1
-#define SIMDUTF_HAS_AVX512VL 1
-#endif
+# if defined(__AVX512VL__) && __AVX512VL__ == 1
+#   define SIMDUTF_HAS_AVX512VL 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512VBMI
-#if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
-#define SIMDUTF_HAS_AVX512VBMI 1
-#endif
+# if defined(__AVX512VBMI__) && __AVX512VBMI__ == 1
+#   define SIMDUTF_HAS_AVX512VBMI 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512VBMI2
-#if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
-#define SIMDUTF_HAS_AVX512VBMI2 1
-#endif
+# if defined(__AVX512VBMI2__) && __AVX512VBMI2__ == 1
+#   define SIMDUTF_HAS_AVX512VBMI2 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512VNNI
-#if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
-#define SIMDUTF_HAS_AVX512VNNI 1
-#endif
+# if defined(__AVX512VNNI__) && __AVX512VNNI__ == 1
+#   define SIMDUTF_HAS_AVX512VNNI 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512BITALG
-#if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
-#define SIMDUTF_HAS_AVX512BITALG 1
-#endif
+# if defined(__AVX512BITALG__) && __AVX512BITALG__ == 1
+#   define SIMDUTF_HAS_AVX512BITALG 1
+# endif
 #endif
 
 #ifndef SIMDUTF_HAS_AVX512VPOPCNTDQ
-#if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
-#define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
-#endif
+# if defined(__AVX512VPOPCNTDQ__) && __AVX512VPOPCNTDQ__ == 1
+#   define SIMDUTF_HAS_AVX512VPOPCNTDQ 1
+# endif
 #endif
 
 #endif // SIMDUTF_AVX512_H_
 /* end file include/simdutf/avx512.h */
 
+
 #if defined(__GNUC__)
-// Marks a block with a name so that MCA analysis can see it.
-#define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
-#define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
-#define SIMDUTF_DEBUG_BLOCK(name, block) \
-    BEGIN_DEBUG_BLOCK(name);             \
-    block;                               \
-    END_DEBUG_BLOCK(name);
+  // Marks a block with a name so that MCA analysis can see it.
+  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+  #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+  #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
 #else
-#define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
-#define SIMDUTF_END_DEBUG_BLOCK(name)
-#define SIMDUTF_DEBUG_BLOCK(name, block)
+  #define SIMDUTF_BEGIN_DEBUG_BLOCK(name)
+  #define SIMDUTF_END_DEBUG_BLOCK(name)
+  #define SIMDUTF_DEBUG_BLOCK(name, block)
 #endif
 
 // Align to N-byte boundary
@@ -385,108 +383,103 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
 
-#define simdutf_really_inline __forceinline
-#define simdutf_never_inline __declspec(noinline)
-
-#define simdutf_unused
-#define simdutf_warn_unused
-
-#ifndef simdutf_likely
-#define simdutf_likely(x) x
-#endif
-#ifndef simdutf_unlikely
-#define simdutf_unlikely(x) x
-#endif
-
-#define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning(push))
-#define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning(push, 0))
-#define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning(disable \
-                                                                    : WARNING_NUMBER))
-// Get rid of Intellisense-only warnings (Code Analysis)
-// Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
-#ifdef __has_include
-#if __has_include(<CppCoreCheck\Warnings.h>)
-#include <CppCoreCheck\Warnings.h>
-#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
-#endif
-#endif
-
-#ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
-#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
-#endif
-
-#define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
-#define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
-#define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning(pop))
+  #define simdutf_really_inline __forceinline
+  #define simdutf_never_inline __declspec(noinline)
+
+  #define simdutf_unused
+  #define simdutf_warn_unused
+
+  #ifndef simdutf_likely
+  #define simdutf_likely(x) x
+  #endif
+  #ifndef simdutf_unlikely
+  #define simdutf_unlikely(x) x
+  #endif
+
+  #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
+  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
+  #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
+  // Get rid of Intellisense-only warnings (Code Analysis)
+  // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
+  #ifdef __has_include
+  #if __has_include(<CppCoreCheck\Warnings.h>)
+  #include <CppCoreCheck\Warnings.h>
+  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
+  #endif
+  #endif
+
+  #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+  #endif
+
+  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
+  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
+  #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))
 
 #else // SIMDUTF_REGULAR_VISUAL_STUDIO
 
-#define simdutf_really_inline inline __attribute__((always_inline))
-#define simdutf_never_inline inline __attribute__((noinline))
+  #define simdutf_really_inline inline __attribute__((always_inline))
+  #define simdutf_never_inline inline __attribute__((noinline))
+
+  #define simdutf_unused __attribute__((unused))
+  #define simdutf_warn_unused __attribute__((warn_unused_result))
+
+  #ifndef simdutf_likely
+  #define simdutf_likely(x) __builtin_expect(!!(x), 1)
+  #endif
+  #ifndef simdutf_unlikely
+  #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
+  #endif
+
+  #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
+  // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
+  #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
+    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
+    SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
+  #define SIMDUTF_PRAGMA(P) _Pragma(#P)
+  #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
+  #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
+  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
+  #else
+  #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+  #endif
+  #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
+  #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
+  #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
 
-#define simdutf_unused __attribute__((unused))
-#define simdutf_warn_unused __attribute__((warn_unused_result))
 
-#ifndef simdutf_likely
-#define simdutf_likely(x) __builtin_expect(!!(x), 1)
-#endif
-#ifndef simdutf_unlikely
-#define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-
-#define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
-// gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
-#define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS                 \
-    SIMDUTF_PUSH_DISABLE_WARNINGS                         \
-    SIMDUTF_DISABLE_GCC_WARNING(-Weffc++)                 \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wall)                    \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wconversion)             \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wextra)                  \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wattributes)             \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit - fallthrough) \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wnon - virtual - dtor)   \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wreturn - type)          \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wshadow)                 \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wunused - parameter)     \
-    SIMDUTF_DISABLE_GCC_WARNING(-Wunused - variable)
-#define SIMDUTF_PRAGMA(P) _Pragma(#P)
-#define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
-#if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
-#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft - include)
-#else
-#define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
-#endif
-#define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated - declarations)
-#define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict - overflow)
-#define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
 
 #endif // MSC_VER
 
 #ifndef SIMDUTF_DLLIMPORTEXPORT
-#if defined(SIMDUTF_VISUAL_STUDIO)
-/**
- * It does not matter here whether you are using
- * the regular visual studio or clang under visual
- * studio.
- */
-#if SIMDUTF_USING_LIBRARY
-#define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
-#else
-#define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
-#endif
-#else
-#define SIMDUTF_DLLIMPORTEXPORT
-#endif
+    #if defined(SIMDUTF_VISUAL_STUDIO)
+      /**
+       * It does not matter here whether you are using
+       * the regular visual studio or clang under visual
+       * studio.
+       */
+      #if SIMDUTF_USING_LIBRARY
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+      #else
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+      #endif
+    #else
+      #define SIMDUTF_DLLIMPORTEXPORT
+    #endif
 #endif
 
 /// If EXPR is an error, returns it.
-#define SIMDUTF_TRY(EXPR)   \
-    {                       \
-        auto _err = (EXPR); \
-        if (_err) {         \
-            return _err;    \
-        }                   \
-    }
+#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
+
 
 #endif // SIMDUTF_COMMON_DEFS_H
 /* end file include/simdutf/common_defs.h */
@@ -497,19 +490,19 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 namespace simdutf {
 
 enum encoding_type {
-    UTF8 = 1, // BOM 0xef 0xbb 0xbf
-    UTF16_LE = 2, // BOM 0xff 0xfe
-    UTF16_BE = 4, // BOM 0xfe 0xff
-    UTF32_LE = 8, // BOM 0xff 0xfe 0x00 0x00
-    UTF32_BE = 16, // BOM 0x00 0x00 0xfe 0xff
-    Latin1 = 32,
-
-    unspecified = 0
+        UTF8 = 1,       // BOM 0xef 0xbb 0xbf
+        UTF16_LE = 2,   // BOM 0xff 0xfe
+        UTF16_BE = 4,   // BOM 0xfe 0xff
+        UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
+        UTF32_BE = 16,   // BOM 0x00 0x00 0xfe 0xff
+        Latin1 = 32,
+
+        unspecified = 0
 };
 
 enum endianness {
-    LITTLE,
-    BIG
+        LITTLE,
+        BIG
 };
 
 bool match_system(endianness e);
@@ -546,27 +539,27 @@ size_t bom_byte_size(encoding_type bom);
 namespace simdutf {
 
 enum error_code {
-    SUCCESS = 0,
-    HEADER_BITS, // Any byte must have fewer than 5 header bits.
-    TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length
-               // This is also the error when the input is truncated.
-    TOO_LONG, // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
-    OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
-              // and U+FFFF for four-byte characters.
-    TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF,less than or equal than U+7F for ASCII OR less than equal than U+FF for Latin1
-    SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
-               // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
-               // there must be no surrogate at all (Latin1)
-    OTHER // Not related to validation/transcoding.
+  SUCCESS = 0,
+  HEADER_BITS,  // Any byte must have fewer than 5 header bits.
+  TOO_SHORT,    // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length.
+                // This is also the error when the input is truncated.
+  TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
+  OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
+                // and U+FFFF for four-byte characters.
+  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF, less than or equal to U+7F for ASCII, OR less than or equal to U+FF for Latin1.
+  SURROGATE,    // The decoded character must not be in U+D800...DFFF (UTF-8 or UTF-32) OR
+                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
+                // there must be no surrogate at all (Latin1).
+  OTHER         // Not related to validation/transcoding.
 };
 
 struct result {
-    error_code error;
-    size_t count; // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written.
+  error_code error;
+  size_t count;     // In case of error, indicates the position of the error. In case of success, indicates the number of words validated/written.
 
-    simdutf_really_inline result();
+  simdutf_really_inline result();
 
-    simdutf_really_inline result(error_code, size_t);
+  simdutf_really_inline result(error_code, size_t);
 };
 
 }
@@ -589,18 +582,18 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
 namespace simdutf {
 enum {
-    /**
-     * The major version (MAJOR.minor.revision) of simdutf being used.
-     */
-    SIMDUTF_VERSION_MAJOR = 3,
-    /**
-     * The minor version (major.MINOR.revision) of simdutf being used.
-     */
-    SIMDUTF_VERSION_MINOR = 2,
-    /**
-     * The revision (major.minor.REVISION) of simdutf being used.
-     */
-    SIMDUTF_VERSION_REVISION = 14
+  /**
+   * The major version (MAJOR.minor.revision) of simdutf being used.
+   */
+  SIMDUTF_VERSION_MAJOR = 3,
+  /**
+   * The minor version (major.MINOR.revision) of simdutf being used.
+   */
+  SIMDUTF_VERSION_MINOR = 2,
+  /**
+   * The revision (major.minor.REVISION) of simdutf being used.
+   */
+  SIMDUTF_VERSION_REVISION = 14
 };
 } // namespace simdutf
 
@@ -678,192 +671,191 @@ namespace simdutf {
 namespace internal {
 
 enum instruction_set {
-    DEFAULT = 0x0,
-    NEON = 0x1,
-    AVX2 = 0x4,
-    SSE42 = 0x8,
-    PCLMULQDQ = 0x10,
-    BMI1 = 0x20,
-    BMI2 = 0x40,
-    ALTIVEC = 0x80,
-    AVX512F = 0x100,
-    AVX512DQ = 0x200,
-    AVX512IFMA = 0x400,
-    AVX512PF = 0x800,
-    AVX512ER = 0x1000,
-    AVX512CD = 0x2000,
-    AVX512BW = 0x4000,
-    AVX512VL = 0x8000,
-    AVX512VBMI2 = 0x10000
+  DEFAULT = 0x0,
+  NEON = 0x1,
+  AVX2 = 0x4,
+  SSE42 = 0x8,
+  PCLMULQDQ = 0x10,
+  BMI1 = 0x20,
+  BMI2 = 0x40,
+  ALTIVEC = 0x80,
+  AVX512F = 0x100,
+  AVX512DQ = 0x200,
+  AVX512IFMA = 0x400,
+  AVX512PF = 0x800,
+  AVX512ER = 0x1000,
+  AVX512CD = 0x2000,
+  AVX512BW = 0x4000,
+  AVX512VL = 0x8000,
+  AVX512VBMI2 = 0x10000,
+  AVX512VPOPCNTDQ = 0x20000 // own bit: 0x2000 is AVX512CD, and these flags are OR-ed into one host_isa mask
 };
 
 #if defined(__PPC64__)
 
-static inline uint32_t detect_supported_architectures()
-{
-    return instruction_set::ALTIVEC;
+static inline uint32_t detect_supported_architectures() {
+  return instruction_set::ALTIVEC;
 }
 
 #elif defined(__aarch64__) || defined(_M_ARM64)
 
-static inline uint32_t detect_supported_architectures()
-{
-    return instruction_set::NEON;
+static inline uint32_t detect_supported_architectures() {
+  return instruction_set::NEON;
 }
 
 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
 
+
 namespace {
 namespace cpuid_bit {
-// Can be found on Intel ISA Reference for CPUID
-
-// EAX = 0x01
-constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 of ECX for EAX=0x1
-constexpr uint32_t sse42 = uint32_t(1) << 20; ///< @private bit 20 of ECX for EAX=0x1
-constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
-
-// EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
-// See: "Table 3-8. Information Returned by CPUID Instruction"
-namespace ebx {
-constexpr uint32_t bmi1 = uint32_t(1) << 3;
-constexpr uint32_t avx2 = uint32_t(1) << 5;
-constexpr uint32_t bmi2 = uint32_t(1) << 8;
-constexpr uint32_t avx512f = uint32_t(1) << 16;
-constexpr uint32_t avx512dq = uint32_t(1) << 17;
-constexpr uint32_t avx512ifma = uint32_t(1) << 21;
-constexpr uint32_t avx512cd = uint32_t(1) << 28;
-constexpr uint32_t avx512bw = uint32_t(1) << 30;
-constexpr uint32_t avx512vl = uint32_t(1) << 31;
-}
+    // Can be found on Intel ISA Reference for CPUID
+
+    // EAX = 0x01
+    constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 of ECX for EAX=0x1
+    constexpr uint32_t sse42 = uint32_t(1) << 20;    ///< @private bit 20 of ECX for EAX=0x1
+    constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
+
+    // EAX = 0x07 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
+    // See: "Table 3-8. Information Returned by CPUID Instruction"
+    namespace ebx {
+      constexpr uint32_t bmi1 = uint32_t(1) << 3;
+      constexpr uint32_t avx2 = uint32_t(1) << 5;
+      constexpr uint32_t bmi2 = uint32_t(1) << 8;
+      constexpr uint32_t avx512f = uint32_t(1) << 16;
+      constexpr uint32_t avx512dq = uint32_t(1) << 17;
+      constexpr uint32_t avx512ifma = uint32_t(1) << 21;
+      constexpr uint32_t avx512cd = uint32_t(1) << 28;
+      constexpr uint32_t avx512bw = uint32_t(1) << 30;
+      constexpr uint32_t avx512vl = uint32_t(1) << 31;
+    }
 
-namespace ecx {
-constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
-constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
-constexpr uint32_t avx512vnni = uint32_t(1) << 11;
-constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
-constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
-}
-namespace edx {
-constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
-}
-namespace xcr0_bit {
-constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
-constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
-}
-}
+    namespace ecx {
+      constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
+      constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
+      constexpr uint32_t avx512vnni = uint32_t(1) << 11;
+      constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
+      constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
+    }
+    namespace edx {
+      constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
+    }
+    namespace xcr0_bit {
+     constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
+     constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
+   }
+  }
 }
 
-static inline void cpuid(uint32_t* eax, uint32_t* ebx, uint32_t* ecx,
-    uint32_t* edx)
-{
-#if defined(_MSC_VER)
-    int cpu_info[4];
-    __cpuidex(cpu_info, *eax, *ecx);
-    *eax = cpu_info[0];
-    *ebx = cpu_info[1];
-    *ecx = cpu_info[2];
-    *edx = cpu_info[3];
-#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
-    uint32_t level = *eax;
-    __get_cpuid(level, eax, ebx, ecx, edx);
-#else
-    uint32_t a = *eax, b, c = *ecx, d;
-    asm volatile("cpuid\n\t"
-                 : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
-    *eax = a;
-    *ebx = b;
-    *ecx = c;
-    *edx = d;
-#endif
-}
 
-static inline uint64_t xgetbv()
-{
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+                         uint32_t *edx) {
 #if defined(_MSC_VER)
-    return _xgetbv(0);
+  int cpu_info[4];
+  __cpuidex(cpu_info, *eax, *ecx);
+  *eax = cpu_info[0];
+  *ebx = cpu_info[1];
+  *ecx = cpu_info[2];
+  *edx = cpu_info[3];
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+  uint32_t level = *eax;
+  __get_cpuid(level, eax, ebx, ecx, edx);
 #else
-    uint32_t xcr0_lo, xcr0_hi;
-    asm volatile("xgetbv\n\t"
-                 : "=a"(xcr0_lo), "=d"(xcr0_hi)
-                 : "c"(0));
-    return xcr0_lo | ((uint64_t)xcr0_hi << 32);
+  uint32_t a = *eax, b, c = *ecx, d;
+  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
 #endif
 }
 
-static inline uint32_t detect_supported_architectures()
-{
-    uint32_t eax;
-    uint32_t ebx = 0;
-    uint32_t ecx = 0;
-    uint32_t edx = 0;
-    uint32_t host_isa = 0x0;
-
-    // EBX for EAX=0x1
-    eax = 0x1;
-    cpuid(&eax, &ebx, &ecx, &edx);
-
-    if (ecx & cpuid_bit::sse42) {
-        host_isa |= instruction_set::SSE42;
-    }
-
-    if (ecx & cpuid_bit::pclmulqdq) {
-        host_isa |= instruction_set::PCLMULQDQ;
-    }
-
-    if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
-        return host_isa;
-    }
+static inline uint64_t xgetbv() {
+ #if defined(_MSC_VER)
+   return _xgetbv(0);
+ #else
+   uint32_t xcr0_lo, xcr0_hi;
+   asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
+   return xcr0_lo | ((uint64_t)xcr0_hi << 32);
+ #endif
+ }
+
+static inline uint32_t detect_supported_architectures() {
+  uint32_t eax;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+  uint32_t host_isa = 0x0;
+
+  // ECX for EAX=0x1
+  eax = 0x1;
+  cpuid(&eax, &ebx, &ecx, &edx);
+
+  if (ecx & cpuid_bit::sse42) {
+    host_isa |= instruction_set::SSE42;
+  }
+
+  if (ecx & cpuid_bit::pclmulqdq) {
+    host_isa |= instruction_set::PCLMULQDQ;
+  }
+
+  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
+    return host_isa;
+  }
 
-    // xgetbv for checking if the OS saves registers
-    uint64_t xcr0 = xgetbv();
+  // xgetbv for checking if the OS saves registers
+  uint64_t xcr0 = xgetbv();
 
-    if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
-        return host_isa;
-    }
-    // ECX for EAX=0x7
-    eax = 0x7;
-    ecx = 0x0; // Sub-leaf = 0
-    cpuid(&eax, &ebx, &ecx, &edx);
-    if (ebx & cpuid_bit::ebx::avx2) {
-        host_isa |= instruction_set::AVX2;
-    }
-    if (ebx & cpuid_bit::ebx::bmi1) {
-        host_isa |= instruction_set::BMI1;
-    }
-    if (ebx & cpuid_bit::ebx::bmi2) {
-        host_isa |= instruction_set::BMI2;
-    }
-    if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
-        return host_isa;
-    }
-    if (ebx & cpuid_bit::ebx::avx512f) {
-        host_isa |= instruction_set::AVX512F;
-    }
-    if (ebx & cpuid_bit::ebx::avx512bw) {
-        host_isa |= instruction_set::AVX512BW;
-    }
-    if (ebx & cpuid_bit::ebx::avx512cd) {
-        host_isa |= instruction_set::AVX512CD;
-    }
-    if (ebx & cpuid_bit::ebx::avx512dq) {
-        host_isa |= instruction_set::AVX512DQ;
-    }
-    if (ebx & cpuid_bit::ebx::avx512vl) {
-        host_isa |= instruction_set::AVX512VL;
-    }
-    if (ecx & cpuid_bit::ecx::avx512vbmi2) {
-        host_isa |= instruction_set::AVX512VBMI2;
-    }
+  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
+    return host_isa;
+  }
+  // EBX and ECX for EAX=0x7
+  eax = 0x7;
+  ecx = 0x0; // Sub-leaf = 0
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ebx & cpuid_bit::ebx::avx2) {
+    host_isa |= instruction_set::AVX2;
+  }
+  if (ebx & cpuid_bit::ebx::bmi1) {
+    host_isa |= instruction_set::BMI1;
+  }
+  if (ebx & cpuid_bit::ebx::bmi2) {
+    host_isa |= instruction_set::BMI2;
+  }
+  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
     return host_isa;
+  }
+  if (ebx & cpuid_bit::ebx::avx512f) {
+    host_isa |= instruction_set::AVX512F;
+  }
+  if (ebx & cpuid_bit::ebx::avx512bw) {
+    host_isa |= instruction_set::AVX512BW;
+  }
+  if (ebx & cpuid_bit::ebx::avx512cd) {
+    host_isa |= instruction_set::AVX512CD;
+  }
+  if (ebx & cpuid_bit::ebx::avx512dq) {
+    host_isa |= instruction_set::AVX512DQ;
+  }
+  if (ebx & cpuid_bit::ebx::avx512vl) {
+    host_isa |= instruction_set::AVX512VL;
+  }
+  if (ecx & cpuid_bit::ecx::avx512vbmi2) {
+    host_isa |= instruction_set::AVX512VBMI2;
+  }
+  if (ecx & cpuid_bit::ecx::avx512vpopcnt) {
+    host_isa |= instruction_set::AVX512VPOPCNTDQ;
+  }
+  return host_isa;
 }
 #else // fallback
 
 // includes 32-bit ARM.
-static inline uint32_t detect_supported_architectures()
-{
-    return instruction_set::DEFAULT;
+static inline uint32_t detect_supported_architectures() {
+  return instruction_set::DEFAULT;
 }
 
+
 #endif // end SIMD extension detection code
 
 } // namespace internal
@@ -872,6 +864,7 @@ static inline uint32_t detect_supported_architectures()
 #endif // SIMDutf_INTERNAL_ISADETECTION_H
 /* end file include/simdutf/internal/isadetection.h */
 
+
 namespace simdutf {
 
 /**
@@ -884,10 +877,9 @@ namespace simdutf {
  * @param length the length of the string in bytes.
  * @return the detected encoding type
  */
-simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char* input, size_t length) noexcept;
-simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t* input, size_t length) noexcept
-{
-    return autodetect_encoding(reinterpret_cast<const char*>(input), length);
+simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
+  return autodetect_encoding(reinterpret_cast<const char *>(input), length);
 }
 
 /**
@@ -901,10 +893,9 @@ simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_enco
  * @param length the length of the string in bytes.
  * @return the detected encoding type
  */
-simdutf_warn_unused int detect_encodings(const char* input, size_t length) noexcept;
-simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t* input, size_t length) noexcept
-{
-    return detect_encodings(reinterpret_cast<const char*>(input), length);
+simdutf_warn_unused int detect_encodings(const char * input, size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * input, size_t length) noexcept {
+  return detect_encodings(reinterpret_cast<const char *>(input), length);
 }
 
 /**
@@ -918,7 +909,7 @@ simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t* in
  * @param len the length of the string in bytes.
  * @return true if and only if the string is valid UTF-8.
  */
-simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-8 string and stop on error.
@@ -929,7 +920,7 @@ simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) noexcept;
  * @param len the length of the string in bytes.
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) noexcept;
+simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept;
 
 /**
  * Validate the ASCII string.
@@ -940,7 +931,7 @@ simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len
  * @param len the length of the string in bytes.
  * @return true if and only if the string is valid ASCII.
  */
-simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
 
 /**
  * Validate the ASCII string and stop on error. It might be faster than
@@ -952,7 +943,7 @@ simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) noexcept;
  * @param len the length of the string in bytes.
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) noexcept;
+simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept;
 
 /**
  * Using native endianness; Validate the UTF-16 string.
@@ -967,7 +958,7 @@ simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t le
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return true if and only if the string is valid UTF-16.
  */
-simdutf_warn_unused bool validate_utf16(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-16LE string. This function may be best when you expect
@@ -982,7 +973,7 @@ simdutf_warn_unused bool validate_utf16(const char16_t* buf, size_t len) noexcep
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return true if and only if the string is valid UTF-16LE.
  */
-simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-16BE string. This function may be best when you expect
@@ -997,7 +988,7 @@ simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) noexc
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return true if and only if the string is valid UTF-16BE.
  */
-simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Using native endianness; Validate the UTF-16 string and stop on error.
@@ -1011,7 +1002,7 @@ simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) noexc
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_utf16_with_errors(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-16LE string and stop on error. It might be faster than
@@ -1025,7 +1016,7 @@ simdutf_warn_unused result validate_utf16_with_errors(const char16_t* buf, size_
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-16BE string and stop on error. It might be faster than
@@ -1039,7 +1030,7 @@ simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, siz
  * @param len the length of the string in number of 2-byte words (char16_t).
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) noexcept;
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-32 string. This function may be best when you expect
@@ -1054,7 +1045,7 @@ simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, siz
  * @param len the length of the string in number of 4-byte words (char32_t).
  * @return true if and only if the string is valid UTF-32.
  */
-simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) noexcept;
+simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept;
 
 /**
  * Validate the UTF-32 string and stop on error. It might be faster than
@@ -1068,68 +1059,69 @@ simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) noexcep
  * @param len the length of the string in number of 4-byte words (char32_t).
  * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
  */
-simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) noexcept;
-
-/**
- * Convert Latin1 string into UTF8 string.
- *
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the Latin1 string to convert
- * @param length        the length of the string in bytes
- * @param latin1_output  the pointer to buffer that can hold conversion result
- * @return the number of written char; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_latin1_to_utf8(const char* input, size_t length, char* utf8_output) noexcept;
-
-/**
- * Convert possibly Latin1 string into UTF-16LE string.
- *
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the Latin1  string to convert
- * @param length        the length of the string in bytes
- * @param utf16_buffer  the pointer to buffer that can hold conversion result
- * @return the number of written char16_t; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept;
-
-/**
- * Convert Latin1 string into UTF-16BE string.
- *
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the Latin1 string to convert
- * @param length        the length of the string in bytes
- * @param utf16_buffer  the pointer to buffer that can hold conversion result
- * @return the number of written char16_t; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept;
-
-/**
- * Convert Latin1 string into UTF-32 string.
- *
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the Latin1 string to convert
- * @param length        the length of the string in bytes
- * @param utf32_buffer  the pointer to buffer that can hold conversion result
- * @return the number of written char32_t; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_latin1_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) noexcept;
-
-/**
- * Convert possibly broken UTF-8 string into latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the UTF-8 string to convert
- * @param length        the length of the string in bytes
- * @param latin1_output  the pointer to buffer that can hold conversion result
- * @return the number of written char; 0 if the input was not valid UTF-8 string
- */
-simdutf_warn_unused size_t convert_utf8_to_latin1(const char* input, size_t length, char* latin1_output) noexcept;
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output   the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
+
+
+  /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string.
@@ -1142,7 +1134,7 @@ simdutf_warn_unused size_t convert_utf8_to_latin1(const char* input, size_t leng
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
  */
-simdutf_warn_unused size_t convert_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-16LE string.
@@ -1155,7 +1147,7 @@ simdutf_warn_unused size_t convert_utf8_to_utf16(const char* input, size_t lengt
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
  */
-simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-16BE string.
@@ -1168,20 +1160,21 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* input, size_t len
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
  */
-simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
-/**
- * Convert possibly broken UTF-8 string into latin1 string. with errors
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * @param input         the UTF-8 string to convert
- * @param length        the length of the string in bytes
- * @param latin1_output  the pointer to buffer that can hold conversion result
- * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
- */
-simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* input, size_t length, char* latin1_output) noexcept;
+
+  /**
+   * Convert possibly broken UTF-8 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16
@@ -1195,7 +1188,7 @@ simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* input,
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
@@ -1208,7 +1201,7 @@ simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char* input,
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
@@ -1221,7 +1214,7 @@ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* input
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept;
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-32 string.
@@ -1234,7 +1227,7 @@ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* input
  * @param utf32_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
  */
-simdutf_warn_unused size_t convert_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_output) noexcept;
+simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept;
 
 /**
  * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
@@ -1247,21 +1240,22 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char* input, size_t lengt
  * @param utf32_buffer  the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
  */
-simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* input, size_t length, char32_t* utf32_output) noexcept;
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
+
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
 
-/**
- * Convert valid UTF-8 string into latin1 string.
- *
- * This function assumes that the input string is valid UTF-8.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-8 string to convert
- * @param length        the length of the string in bytes
- * @param latin1_output  the pointer to buffer that can hold conversion result
- * @return the number of written char; 0 if the input was not valid UTF-8 string
- */
-simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* input, size_t length, char* latin1_output) noexcept;
 
 /**
  * Using native endianness; Convert valid UTF-8 string into UTF-16 string.
@@ -1273,7 +1267,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* input, size_
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t
  */
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert valid UTF-8 string into UTF-16LE string.
@@ -1285,7 +1279,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char* input, size_t
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t
  */
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert valid UTF-8 string into UTF-16BE string.
@@ -1297,7 +1291,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* input, size
  * @param utf16_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char16_t
  */
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert valid UTF-8 string into UTF-32 string.
@@ -1309,29 +1303,30 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* input, size
  * @param utf32_buffer  the pointer to buffer that can hold conversion result
  * @return the number of written char32_t
  */
-simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
 
-/**
- * Return the number of bytes that this Latin1 string would require in UTF-8 format.
- *
- * @param input         the Latin1 string to convert
- * @param length        the length of the string bytes
- * @return the number of bytes required to encode the Latin1 string as UTF-8
- */
-simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) noexcept;
 
-/**
- * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
- *
- * This function does not validate the input.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-8 string to convert
- * @param length        the length of the string in byte
- * @return the number of bytes required to encode the UTF-8 string as Latin1
- */
-simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) noexcept;
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept;
 
 /**
  * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
@@ -1344,7 +1339,7 @@ simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t len
  * @param length        the length of the string in bytes
  * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
  */
-simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
 
 /**
  * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
@@ -1359,7 +1354,7 @@ simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t leng
  * @param length        the length of the string in bytes
  * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
  */
-simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string.
@@ -1374,37 +1369,39 @@ simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t leng
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
-
-/**
- * Convert possibly broken UTF-16LE string into Latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16LE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return number of written words; 0 if input is not a valid UTF-16LE string
- */
-simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
+
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
 
-/**
- * Convert possibly broken UTF-16BE string into Latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16BE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
- */
-simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16LE string into UTF-8 string.
@@ -1419,7 +1416,7 @@ simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* input, size
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16BE string into UTF-8 string.
@@ -1434,35 +1431,36 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* input, size_t
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
 
-/**
- * Convert possibly broken UTF-16LE string into Latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16LE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
- */
-simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
-
-/**
- * Convert possibly broken UTF-16BE string into Latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16BE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
- */
-simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error.
@@ -1477,7 +1475,7 @@ simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
  */
-simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
@@ -1492,7 +1490,7 @@ simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t* inp
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
  */
-simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
@@ -1507,7 +1505,7 @@ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* i
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
  */
-simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Using native endianness; Convert valid UTF-16 string into UTF-8 string.
@@ -1521,35 +1519,37 @@ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* i
  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
-/**
- * Convert valid UTF-16LE string into Latin1 string.
- *
- * This function assumes that the input string is valid UTF-8.
 
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16LE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return number of written words; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
 
-/**
- * Convert valid UTF-16BE string into Latin1 string.
- *
- * This function assumes that the input string is valid UTF-8.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16BE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return number of written words; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) noexcept;
 
 /**
  * Convert valid UTF-16LE string into UTF-8 string.
@@ -1563,7 +1563,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* input
  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert valid UTF-16BE string into UTF-8 string.
@@ -1577,7 +1577,7 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* input,
  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string.
@@ -1592,7 +1592,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* input,
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16LE string into UTF-32 string.
@@ -1607,7 +1607,7 @@ simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t* input, size_t
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16BE string into UTF-32 string.
@@ -1622,7 +1622,7 @@ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* input, size_
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-16LE string
  */
-simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into
@@ -1638,7 +1638,7 @@ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* input, size_
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
  */
-simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
@@ -1653,7 +1653,7 @@ simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t* in
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
  */
-simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
@@ -1668,7 +1668,7 @@ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*
  * @param utf32_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
  */
-simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Using native endianness; Convert valid UTF-16 string into UTF-32 string.
@@ -1682,7 +1682,7 @@ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*
  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert valid UTF-16LE string into UTF-32 string.
@@ -1696,7 +1696,7 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t* input, s
  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
 /**
  * Convert valid UTF-16BE string into UTF-32 string.
@@ -1710,20 +1710,22 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* input,
  * @param utf32_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
+
 
 /*
- * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
- *
- * This function does not validate the input.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-16LE string to convert
- * @param length        the length of the string in 2-byte words (char16_t)
- * @return the number of bytes required to encode the UTF-16LE string as Latin1
- */
-simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
+
 
 /**
  * Using native endianness; Compute the number of bytes that this UTF-16
@@ -1735,7 +1737,7 @@ simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16LE string as UTF-8
  */
-simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept;
 
 /**
  * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
@@ -1746,7 +1748,7 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t* input, size_t
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16LE string as UTF-8
  */
-simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept;
 
 /**
  * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
@@ -1757,7 +1759,7 @@ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16BE string as UTF-8
  */
-simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-8 string.
@@ -1772,7 +1774,7 @@ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-32 string
  */
-simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
@@ -1787,7 +1789,7 @@ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* input, size_t l
  * @param utf8_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
  */
-simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Convert valid UTF-32 string into UTF-8 string.
@@ -1801,7 +1803,7 @@ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* inp
  * @param utf8_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* input, size_t length, char* utf8_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string.
@@ -1816,7 +1818,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* input, si
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-32 string
  */
-simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-16LE string.
@@ -1831,53 +1833,54 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t* input, size_t
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-32 string
  */
-simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
-
-/**
- * Convert possibly broken UTF-32 string into Latin1 string.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-32 string to convert
- * @param length        the length of the string in 4-byte words (char32_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return number of written words; 0 if input is not a valid UTF-32 string
- */
-
-simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_buffer) noexcept;
-
-/**
- * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
- *
- * During the conversion also validation of the input string is done.
- * This function is suitable to work with inputs from untrusted sources.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-32 string to convert
- * @param length        the length of the string in 4-byte words (char32_t)
- * @param latin1_buffer   the pointer to buffer that can hold conversion result
- * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
- */
-
-simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* input, size_t length, char* latin1_buffer) noexcept;
-
-/**
- * Convert valid UTF-32 string into Latin1 string.
- *
- * This function assumes that the input string is valid UTF-32.
- *
- * This function is not BOM-aware.
- *
- * @param input         the UTF-32 string to convert
- * @param length        the length of the string in 4-byte words (char32_t)
- * @param latin1_buffer   the pointer to buffer that can hold the conversion result
- * @return number of written words; 0 if conversion is not possible
- */
-simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-16BE string.
@@ -1892,7 +1895,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* input,
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return number of written words; 0 if input is not a valid UTF-32 string
  */
-simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Using native endianness; Convert possibly broken UTF-32 string into UTF-16
@@ -1908,7 +1911,7 @@ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* input, size_
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
@@ -1923,7 +1926,7 @@ simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t* in
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
@@ -1938,7 +1941,7 @@ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*
  * @param utf16_buffer   the pointer to buffer that can hold conversion result
  * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
  */
-simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Using native endianness; Convert valid UTF-32 string into UTF-16 string.
@@ -1952,7 +1955,7 @@ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*
  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert valid UTF-32 string into UTF-16LE string.
@@ -1966,7 +1969,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t* input, s
  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Convert valid UTF-32 string into UTF-16BE string.
@@ -1980,7 +1983,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* input,
  * @param utf16_buffer   the pointer to buffer that can hold the conversion result
  * @return number of written words; 0 if conversion is not possible
  */
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* input, size_t length, char16_t* utf16_buffer) noexcept;
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
 /**
  * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
@@ -1994,7 +1997,7 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* input,
  * @param length        the length of the string in 2-byte words (char16_t)
  * @param output        the pointer to buffer that can hold the conversion result
  */
-void change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) noexcept;
+void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept;
 
 /**
  * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
@@ -2005,7 +2008,7 @@ void change_endianness_utf16(const char16_t* input, size_t length, char16_t* out
  * @param length        the length of the string in 4-byte words (char32_t)
  * @return the number of bytes required to encode the UTF-32 string as UTF-8
  */
-simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept;
 
 /**
  * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
@@ -2016,7 +2019,7 @@ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t
  * @param length        the length of the string in 4-byte words (char32_t)
  * @return the number of bytes required to encode the UTF-32 string as UTF-16
  */
-simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept;
 
 /**
  * Using native endianness; Compute the number of bytes that this UTF-16
@@ -2032,7 +2035,7 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16LE string as UTF-32
  */
-simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept;
 
 /**
  * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format.
@@ -2047,7 +2050,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t* input, size_t
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16LE string as UTF-32
  */
-simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept;
 
 /**
  * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format.
@@ -2062,7 +2065,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return the number of bytes required to encode the UTF-16BE string as UTF-32
  */
-simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept;
 
 /**
  * Count the number of code points (characters) in the string assuming that
@@ -2076,7 +2079,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return number of code points
  */
-simdutf_warn_unused size_t count_utf16(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept;
 
 /**
  * Count the number of code points (characters) in the string assuming that
@@ -2090,7 +2093,7 @@ simdutf_warn_unused size_t count_utf16(const char16_t* input, size_t length) noe
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return number of code points
  */
-simdutf_warn_unused size_t count_utf16le(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept;
 
 /**
  * Count the number of code points (characters) in the string assuming that
@@ -2104,7 +2107,7 @@ simdutf_warn_unused size_t count_utf16le(const char16_t* input, size_t length) n
  * @param length        the length of the string in 2-byte words (char16_t)
  * @return number of code points
  */
-simdutf_warn_unused size_t count_utf16be(const char16_t* input, size_t length) noexcept;
+simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept;
 
 /**
  * Count the number of code points (characters) in the string assuming that
@@ -2116,7 +2119,7 @@ simdutf_warn_unused size_t count_utf16be(const char16_t* input, size_t length) n
  * @param length        the length of the string in bytes
  * @return number of code points
  */
-simdutf_warn_unused size_t count_utf8(const char* input, size_t length) noexcept;
+simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept;
 
 /**
  * An implementation of simdutf for a particular CPU architecture.
@@ -2126,1080 +2129,1090 @@ simdutf_warn_unused size_t count_utf8(const char* input, size_t length) noexcept
  */
 class implementation {
 public:
-    /**
-     * The name of this implementation.
-     *
-     *     const implementation *impl = simdutf::active_implementation;
-     *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
-     *
-     * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
-     */
-    virtual const std::string& name() const { return _name; }
-
-    /**
-     * The description of this implementation.
-     *
-     *     const implementation *impl = simdutf::active_implementation;
-     *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
-     *
-     * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
-     */
-    virtual const std::string& description() const { return _description; }
-
-    /**
-     * The instruction sets this implementation is compiled against
-     * and the current CPU match. This function may poll the current CPU/system
-     * and should therefore not be called too often if performance is a concern.
-     *
-     *
-     * @return true if the implementation can be safely used on the current system (determined at runtime)
-     */
-    bool supported_by_runtime_system() const;
-
-    /**
-     * This function will try to detect the encoding
-     * @param input the string to identify
-     * @param length the length of the string in bytes.
-     * @return the encoding type detected
-     */
-    virtual encoding_type autodetect_encoding(const char* input, size_t length) const noexcept;
-
-    /**
-     * This function will try to detect the possible encodings in one pass
-     * @param input the string to identify
-     * @param length the length of the string in bytes.
-     * @return the encoding type detected
-     */
-    virtual int detect_encodings(const char* input, size_t length) const noexcept = 0;
-
-    /**
-     * @private For internal implementation use
-     *
-     * The instruction sets this implementation is compiled against.
-     *
-     * @return a mask of all required `internal::instruction_set::` values
-     */
-    virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
-
-    /**
-     * Validate the UTF-8 string.
-     *
-     * Overridden by each implementation.
-     *
-     * @param buf the UTF-8 string to validate.
-     * @param len the length of the string in bytes.
-     * @return true if and only if the string is valid UTF-8.
-     */
-    simdutf_warn_unused virtual bool validate_utf8(const char* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-8 string and stop on errors.
-     *
-     * Overridden by each implementation.
-     *
-     * @param buf the UTF-8 string to validate.
-     * @param len the length of the string in bytes.
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result validate_utf8_with_errors(const char* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the ASCII string.
-     *
-     * Overridden by each implementation.
-     *
-     * @param buf the ASCII string to validate.
-     * @param len the length of the string in bytes.
-     * @return true if and only if the string is valid ASCII.
-     */
-    simdutf_warn_unused virtual bool validate_ascii(const char* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the ASCII string and stop on error.
-     *
-     * Overridden by each implementation.
-     *
-     * @param buf the ASCII string to validate.
-     * @param len the length of the string in bytes.
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result validate_ascii_with_errors(const char* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-16LE string.This function may be best when you expect
-     * the input to be almost always valid. Otherwise, consider using
-     * validate_utf16le_with_errors.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-16LE string to validate.
-     * @param len the length of the string in number of 2-byte words (char16_t).
-     * @return true if and only if the string is valid UTF-16LE.
-     */
-    simdutf_warn_unused virtual bool validate_utf16le(const char16_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-16BE string. This function may be best when you expect
-     * the input to be almost always valid. Otherwise, consider using
-     * validate_utf16be_with_errors.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-16BE string to validate.
-     * @param len the length of the string in number of 2-byte words (char16_t).
-     * @return true if and only if the string is valid UTF-16BE.
-     */
-    simdutf_warn_unused virtual bool validate_utf16be(const char16_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-16LE string and stop on error.  It might be faster than
-     * validate_utf16le when an error is expected to occur early.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-16LE string to validate.
-     * @param len the length of the string in number of 2-byte words (char16_t).
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-16BE string and stop on error. It might be faster than
-     * validate_utf16be when an error is expected to occur early.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-16BE string to validate.
-     * @param len the length of the string in number of 2-byte words (char16_t).
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-32 string.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-32 string to validate.
-     * @param len the length of the string in number of 4-byte words (char32_t).
-     * @return true if and only if the string is valid UTF-32.
-     */
-    simdutf_warn_unused virtual bool validate_utf32(const char32_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Validate the UTF-32 string and stop on error.
-     *
-     * Overridden by each implementation.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param buf the UTF-32 string to validate.
-     * @param len the length of the string in number of 4-byte words (char32_t).
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept = 0;
-
-    /**
-     * Convert Latin1 string into UTF8 string.
-     *
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the Latin1 string to convert
-     * @param length        the length of the string in bytes
-     * @param latin1_output  the pointer to buffer that can hold conversion result
-     * @return the number of written char; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char* input, size_t length, char* utf8_output) const noexcept = 0;
-
-    /**
-     * Convert possibly Latin1 string into UTF-16LE string.
-     *
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the Latin1  string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert Latin1 string into UTF-16BE string.
-     *
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the Latin1 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert Latin1 string into UTF-32 string.
-     *
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the Latin1 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf32_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char32_t; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param latin1_output  the pointer to buffer that can hold conversion result
-     * @return the number of written char; 0 if the input was not valid UTF-8 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char* input, size_t length, char* latin1_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into latin1 string. with errors
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param latin1_output  the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char* input, size_t length, char* latin1_output) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-8 string into latin1 string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param latin1_output  the pointer to buffer that can hold conversion result
-     * @return the number of written char; 0 if the input was not valid UTF-8 string
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char* input, size_t length, char* latin1_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-16LE string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-16BE string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char* input, size_t length, char16_t* utf16_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-32 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf32_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_output) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf32_buffer  the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char* input, size_t length, char32_t* utf32_output) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-8 string into UTF-16LE string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-8 string into UTF-16BE string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char16_t
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-8 string into UTF-32 string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in bytes
-     * @param utf16_buffer  the pointer to buffer that can hold conversion result
-     * @return the number of written char32_t
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
 
-    /**
-     * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-8 string to process
-     * @param length        the length of the string in bytes
-     * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
-     */
-    simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
-     *
-     * This function is equivalent to count_utf8.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-8 string to process
-     * @param length        the length of the string in bytes
-     * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
-     */
-    simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into Latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-16LE string
-     */
-    simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into Latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into Latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into Latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16LE string into Latin1 string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16BE string into Latin1 string.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into UTF-8 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-16LE string
-     */
-    simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into UTF-8 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-16BE string
-     */
-    simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16LE string into UTF-8 string.
-     *
-     * This function assumes that the input string is valid UTF-16LE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16BE string into UTF-8 string.
-     *
-     * This function assumes that the input string is valid UTF-16BE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf8_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into UTF-32 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-16LE string
-     */
-    simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into UTF-32 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-16BE string
-     */
-    simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16LE string into UTF-32 string.
-     *
-     * This function assumes that the input string is valid UTF-16LE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-16LE string into UTF-32BE string.
-     *
-     * This function assumes that the input string is valid UTF-16BE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param utf32_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t* input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
-
-    /**
-     * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16LE string as UTF-8
-     */
-    simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16BE string as UTF-8
-     */
-    simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into Latin1 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-32 string
-     */
-
-    simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param latin1_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-
-    simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-32 string into Latin1 string.
-     *
-     * This function assumes that the input string is valid UTF-32.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param latin1_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into UTF-8 string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-32 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf8_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-32 string into UTF-8 string.
-     *
-     * This function assumes that the input string is valid UTF-32.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf8_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t* input, size_t length, char* utf8_buffer) const noexcept = 0;
-
-    /**
-     * Return the number of bytes that this UTF-16 string would require in Latin1 format.
-     *
-     *
-     * @param input         the UTF-16 string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16 string as Latin1
-     */
+  /**
+   * The name of this implementation.
+   *
+   *     const implementation *impl = simdutf::active_implementation;
+   *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
+   *
+   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
+   */
+  virtual const std::string &name() const { return _name; }
+
+  /**
+   * The description of this implementation.
+   *
+   *     const implementation *impl = simdutf::active_implementation;
+   *     cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
+   *
+   * @return a human-readable description of the implementation
+   */
+  virtual const std::string &description() const { return _description; }
+
+  /**
+   * Check whether the instruction sets this implementation is compiled against
+   * are available on the current CPU. This function may poll the current CPU/system
+   * and should therefore not be called too often if performance is a concern.
+   *
+   *
+   * @return true if the implementation can be safely used on the current system (determined at runtime)
+   */
+  bool supported_by_runtime_system() const;
+
+  /**
+   * This function will try to detect the encoding
+   * @param input the string to identify
+   * @param length the length of the string in bytes.
+   * @return the encoding type detected
+   */
+  virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept;
+
+  /**
+   * This function will try to detect the possible encodings in one pass
+   * @param input the string to identify
+   * @param length the length of the string in bytes.
+   * @return the encoding type detected
+   */
+  virtual int detect_encodings(const char * input, size_t length) const noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * The instruction sets this implementation is compiled against.
+   *
+   * @return a mask of all required `internal::instruction_set::` values
+   */
+  virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
+
+
+  /**
+   * Validate the UTF-8 string.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the UTF-8 string to validate.
+   * @param len the length of the string in bytes.
+   * @return true if and only if the string is valid UTF-8.
+   */
+  simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-8 string and stop on errors.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the UTF-8 string to validate.
+   * @param len the length of the string in bytes.
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the ASCII string.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the ASCII string to validate.
+   * @param len the length of the string in bytes.
+   * @return true if and only if the string is valid ASCII.
+   */
+  simdutf_warn_unused virtual bool validate_ascii(const char *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the ASCII string and stop on error.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the ASCII string to validate.
+   * @param len the length of the string in bytes.
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-16LE string. This function may be best when you expect
+   * the input to be almost always valid. Otherwise, consider using
+   * validate_utf16le_with_errors.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-16LE string to validate.
+   * @param len the length of the string in number of 2-byte words (char16_t).
+   * @return true if and only if the string is valid UTF-16LE.
+   */
+  simdutf_warn_unused virtual bool validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-16BE string. This function may be best when you expect
+   * the input to be almost always valid. Otherwise, consider using
+   * validate_utf16be_with_errors.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-16BE string to validate.
+   * @param len the length of the string in number of 2-byte words (char16_t).
+   * @return true if and only if the string is valid UTF-16BE.
+   */
+  simdutf_warn_unused virtual bool validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-16LE string and stop on error. It might be faster than
+   * validate_utf16le when an error is expected to occur early.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-16LE string to validate.
+   * @param len the length of the string in number of 2-byte words (char16_t).
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-16BE string and stop on error. It might be faster than
+   * validate_utf16be when an error is expected to occur early.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-16BE string to validate.
+   * @param len the length of the string in number of 2-byte words (char16_t).
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-32 string.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-32 string to validate.
+   * @param len the length of the string in number of 4-byte words (char32_t).
+   * @return true if and only if the string is valid UTF-32.
+   */
+  simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Validate the UTF-32 string and stop on error.
+   *
+   * Overridden by each implementation.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param buf the UTF-32 string to validate.
+   * @param len the length of the string in number of 4-byte words (char32_t).
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output   the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
+
+
+  /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-16LE string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-16BE string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-32 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-8 string into UTF-16LE string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+/**
+   * Convert valid UTF-8 string into UTF-16BE string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-8 string into UTF-32 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-8 string to process
+   * @param length        the length of the string in bytes
+   * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
+   */
+  simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0;
+
+   /**
+   * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format.
+   *
+   * This function is equivalent to count_utf8.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-8 string to process
+   * @param length        the length of the string in bytes
+   * @return the number of char32_t words required to encode the UTF-8 string as UTF-32
+   */
+  simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into UTF-8 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into UTF-8 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16le_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16LE string into UTF-8 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16BE string into UTF-8 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into UTF-32 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into UTF-32 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16LE string into UTF-32 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16BE string into UTF-32 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param utf32_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
+   */
+  simdutf_warn_unused virtual size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
+   */
+  simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into UTF-8 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf8_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into UTF-8 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf8_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
+
+
+    /**
+   * Compute the number of 2-byte words that a Latin1 string of the given length would require in UTF-16 format.
+   *
+   * Only the length is needed: the declaration takes no input pointer.
+   *
+   * @param length        the length of the Latin1 string in bytes
+   * @return the number of char16_t words required to encode the Latin1 string as UTF-16
+   */
     simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
 
-    /**
-     * Convert possibly broken UTF-32 string into UTF-16LE string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-32 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into UTF-16BE string.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold conversion result
-     * @return number of written words; 0 if input is not a valid UTF-32 string
-     */
-    simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
-     *
-     * During the conversion also validation of the input string is done.
-     * This function is suitable to work with inputs from untrusted sources.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold conversion result
-     * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
-     */
-    simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-32 string into UTF-16LE string.
-     *
-     * This function assumes that the input string is valid UTF-32.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
-
-    /**
-     * Convert valid UTF-32 string into UTF-16BE string.
-     *
-     * This function assumes that the input string is valid UTF-32.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @param utf16_buffer   the pointer to buffer that can hold the conversion result
-     * @return number of written words; 0 if conversion is not possible
-     */
-    simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t* input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+  /**
+   * Convert possibly broken UTF-32 string into UTF-16LE string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into UTF-16BE string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into UTF-16LE string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into UTF-16BE string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param utf16_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0;
+
+  /**
+   * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
+   * from UTF-16BE to UTF-16LE.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16 string to process
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param output        the pointer to buffer that can hold the conversion result
+   */
+  virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
+
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @return the number of bytes required to encode the UTF-32 string as UTF-8
+   */
+  simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
+   *
+   * This function does not validate the input: only the length of the string
+   * is passed, not the string itself.
+   *
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @return the number of bytes required to encode the UTF-32 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf32( size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in byte
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0;
 
-    /**
-     * Change the endianness of the input. Can be used to go from UTF-16LE to UTF-16BE or
-     * from UTF-16BE to UTF-16LE.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16 string to process
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @param output        the pointer to buffer that can hold the conversion result
-     */
-    virtual void change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept = 0;
-
-    /**
-     * Return the number of bytes that this Latin1 string would require in UTF-8 format.
-     *
-     * @param input         the Latin1 string to convert
-     * @param length        the length of the string bytes
-     * @return the number of bytes required to encode the Latin1 string as UTF-8
-     */
-    simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @return the number of bytes required to encode the UTF-32 string as UTF-8
-     */
-    simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @return the number of bytes required to encode the UTF-32 string as Latin1
-     */
-    simdutf_warn_unused virtual size_t latin1_length_from_utf32(size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-8 string to convert
-     * @param length        the length of the string in byte
-     * @return the number of bytes required to encode the UTF-8 string as Latin1
-     */
-    simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept = 0;
-
-    /*
-     * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16LE string as Latin1
-     */
-    simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
-
-    /**
-     * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @return the number of bytes required to encode the UTF-32 string as UTF-16
-     */
-    simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept = 0;
-
-    /**
-     * Return the number of bytes that this UTF-32 string would require in Latin1 format.
-     *
-     * This function does not validate the input.
-     *
-     * @param input         the UTF-32 string to convert
-     * @param length        the length of the string in 4-byte words (char32_t)
-     * @return the number of bytes required to encode the UTF-32 string as Latin1
-     */
+/*
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input: only the length of the string
+   * is passed, not the string itself.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @return the number of two-byte words (char16_t) required to encode the UTF-32 string as UTF-16
+   */
+  simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
+
+
+    /**
+   * Return the length that this Latin1 string would require in UTF-32 format.
+   *
+   * This function does not validate the input: only the length of the string
+   * is passed, not the string itself.
+   *
+   * @param length        the length of the Latin1 string in bytes
+   * @return the length required to encode the Latin1 string as UTF-32
+   */
     simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0;
 
-    /*
-     * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format.
-     *
-     * This function is equivalent to count_utf16le.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16LE string as UTF-32
-     */
-    simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept = 0;
-
-    /*
-     * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format.
-     *
-     * This function is equivalent to count_utf16be.
-     *
-     * This function does not validate the input.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to convert
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return the number of bytes required to encode the UTF-16BE string as UTF-32
-     */
-    simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept = 0;
+  /*
+   * Compute the number of 4-byte words (char32_t) that this UTF-16LE string would require in UTF-32 format.
+   *
+   * This function is equivalent to count_utf16le.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of 4-byte words (char32_t) required to encode the UTF-16LE string as UTF-32
+   */
+  simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0;
+
+  /*
+   * Compute the number of 4-byte words (char32_t) that this UTF-16BE string would require in UTF-32 format.
+   *
+   * This function is equivalent to count_utf16be.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of 4-byte words (char32_t) required to encode the UTF-16BE string as UTF-32
+   */
+  simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
+
+  /**
+   * Count the number of code points (characters) in the string assuming that
+   * it is valid.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to process
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return number of code points
+   */
+  simdutf_warn_unused virtual size_t count_utf16le(const char16_t * input, size_t length) const noexcept = 0;
+
+  /**
+   * Count the number of code points (characters) in the string assuming that
+   * it is valid.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to process
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return number of code points
+   */
+  simdutf_warn_unused virtual size_t count_utf16be(const char16_t * input, size_t length) const noexcept = 0;
+
+
+  /**
+   * Count the number of code points (characters) in the string assuming that
+   * it is valid.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * @param input         the UTF-8 string to process
+   * @param length        the length of the string in bytes
+   * @return number of code points
+   */
+  simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
 
-    /**
-     * Count the number of code points (characters) in the string assuming that
-     * it is valid.
-     *
-     * This function assumes that the input string is valid UTF-16LE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16LE string to process
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return number of code points
-     */
-    simdutf_warn_unused virtual size_t count_utf16le(const char16_t* input, size_t length) const noexcept = 0;
 
-    /**
-     * Count the number of code points (characters) in the string assuming that
-     * it is valid.
-     *
-     * This function assumes that the input string is valid UTF-16BE.
-     *
-     * This function is not BOM-aware.
-     *
-     * @param input         the UTF-16BE string to process
-     * @param length        the length of the string in 2-byte words (char16_t)
-     * @return number of code points
-     */
-    simdutf_warn_unused virtual size_t count_utf16be(const char16_t* input, size_t length) const noexcept = 0;
-
-    /**
-     * Count the number of code points (characters) in the string assuming that
-     * it is valid.
-     *
-     * This function assumes that the input string is valid UTF-8.
-     *
-     * @param input         the UTF-8 string to process
-     * @param length        the length of the string in bytes
-     * @return number of code points
-     */
-    simdutf_warn_unused virtual size_t count_utf8(const char* input, size_t length) const noexcept = 0;
 
 protected:
-    /** @private Construct an implementation with the given name and description. For subclasses. */
-    simdutf_really_inline implementation(
-        std::string name,
-        std::string description,
-        uint32_t required_instruction_sets)
-        : _name(name)
-        , _description(description)
-        , _required_instruction_sets(required_instruction_sets)
-    {
-    }
-    virtual ~implementation() = default;
+  /** @private Construct an implementation with the given name and description. For subclasses. */
+  simdutf_really_inline implementation(
+    std::string name,
+    std::string description,
+    uint32_t required_instruction_sets
+  ) :
+    _name(name),
+    _description(description),
+    _required_instruction_sets(required_instruction_sets)
+  {
+  }
+  virtual ~implementation()=default;
 
 private:
-    /**
-     * The name of this implementation.
-     */
-    const std::string _name;
-
-    /**
-     * The description of this implementation.
-     */
-    const std::string _description;
-
-    /**
-     * Instruction sets required for this implementation.
-     */
-    const uint32_t _required_instruction_sets;
+  /**
+   * The name of this implementation.
+   */
+  const std::string _name;
+
+  /**
+   * The description of this implementation.
+   */
+  const std::string _description;
+
+  /**
+   * Instruction sets required for this implementation.
+   */
+  const uint32_t _required_instruction_sets;
 };
 
 /** @private */
@@ -3210,102 +3223,82 @@ namespace internal {
  */
 class available_implementation_list {
 public:
-    /** Get the list of available implementations compiled into simdutf */
-    simdutf_really_inline available_implementation_list() {}
-    /** Number of implementations */
-    size_t size() const noexcept;
-    /** STL const begin() iterator */
-    const implementation* const* begin() const noexcept;
-    /** STL const end() iterator */
-    const implementation* const* end() const noexcept;
-
-    /**
-     * Get the implementation with the given name.
-     *
-     * Case sensitive.
-     *
-     *     const implementation *impl = simdutf::available_implementations["westmere"];
-     *     if (!impl) { exit(1); }
-     *     if (!imp->supported_by_runtime_system()) { exit(1); }
-     *     simdutf::active_implementation = impl;
-     *
-     * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
-     * @return the implementation, or nullptr if the parse failed.
-     */
-    const implementation* operator[](const std::string& name) const noexcept
-    {
-        for (const implementation* impl : *this) {
-            if (impl->name() == name) {
-                return impl;
-            }
-        }
-        return nullptr;
+  /** Get the list of available implementations compiled into simdutf */
+  simdutf_really_inline available_implementation_list() {}
+  /** Number of implementations */
+  size_t size() const noexcept;
+  /** STL const begin() iterator */
+  const implementation * const *begin() const noexcept;
+  /** STL const end() iterator */
+  const implementation * const *end() const noexcept;
+
+  /**
+   * Get the implementation with the given name.
+   *
+   * Case sensitive.
+   *
+   *     const implementation *impl = simdutf::available_implementations["westmere"];
+   *     if (!impl) { exit(1); }
+   *     if (!impl->supported_by_runtime_system()) { exit(1); }
+   *     simdutf::active_implementation = impl;
+   *
+   * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
+   * @return the implementation, or nullptr if no implementation has that name.
+   */
+  const implementation * operator[](const std::string &name) const noexcept {
+    for (const implementation * impl : *this) {
+      if (impl->name() == name) { return impl; }
     }
-
-    /**
-     * Detect the most advanced implementation supported by the current host.
-     *
-     * This is used to initialize the implementation on startup.
-     *
-     *     const implementation *impl = simdutf::available_implementation::detect_best_supported();
-     *     simdutf::active_implementation = impl;
-     *
-     * @return the most advanced supported implementation for the current host, or an
-     *         implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
-     *         implementation. Will never return nullptr.
-     */
-    const implementation* detect_best_supported() const noexcept;
+    return nullptr;
+  }
+
+  /**
+   * Detect the most advanced implementation supported by the current host.
+   *
+   * This is used to initialize the implementation on startup.
+   *
+   *     const implementation *impl = simdutf::available_implementation::detect_best_supported();
+   *     simdutf::active_implementation = impl;
+   *
+   * @return the most advanced supported implementation for the current host, or an
+   *         implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
+   *         implementation. Will never return nullptr.
+   */
+  const implementation *detect_best_supported() const noexcept;
 };
 
 template<typename T>
 class atomic_ptr {
 public:
-    atomic_ptr(T* _ptr)
-        : ptr { _ptr }
-    {
-    }
+  atomic_ptr(T *_ptr) : ptr{_ptr} {}
 
 #if defined(SIMDUTF_NO_THREADS)
-    operator const T*() const
-    {
-        return ptr;
-    }
-    const T& operator*() const { return *ptr; }
-    const T* operator->() const { return ptr; }
-
-    operator T*() { return ptr; }
-    T& operator*() { return *ptr; }
-    T* operator->() { return ptr; }
-    atomic_ptr& operator=(T* _ptr)
-    {
-        ptr = _ptr;
-        return *this;
-    }
+  operator const T*() const { return ptr; }
+  const T& operator*() const { return *ptr; }
+  const T* operator->() const { return ptr; }
+
+  operator T*() { return ptr; }
+  T& operator*() { return *ptr; }
+  T* operator->() { return ptr; }
+  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
 
 #else
-    operator const T*() const
-    {
-        return ptr.load();
-    }
-    const T& operator*() const { return *ptr; }
-    const T* operator->() const { return ptr.load(); }
-
-    operator T*() { return ptr.load(); }
-    T& operator*() { return *ptr; }
-    T* operator->() { return ptr.load(); }
-    atomic_ptr& operator=(T* _ptr)
-    {
-        ptr = _ptr;
-        return *this;
-    }
+  operator const T*() const { return ptr.load(); }
+  const T& operator*() const { return *ptr; }
+  const T* operator->() const { return ptr.load(); }
+
+  operator T*() { return ptr.load(); }
+  T& operator*() { return *ptr; }
+  T* operator->() { return ptr.load(); }
+  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
 
 #endif
 
 private:
 #if defined(SIMDUTF_NO_THREADS)
-    T* ptr;
+  T* ptr;
 #else
-    std::atomic<T*> ptr;
+  std::atomic<T*> ptr;
 #endif
 };
 
@@ -3319,22 +3312,25 @@ class detect_best_supported_implementation_on_first_use;
 extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
 
 /**
- * The active implementation.
- *
- * Automatically initialized on first use to the most advanced implementation supported by this hardware.
- */
+  * The active implementation.
+  *
+  * Automatically initialized on first use to the most advanced implementation supported by this hardware.
+  */
 extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
 
+
 } // namespace simdutf
 
 #endif // SIMDUTF_IMPLEMENTATION_H
 /* end file include/simdutf/implementation.h */
 
+
 // Implementation-internal files (must be included before the implementations themselves, to keep
 // amalgamation working--otherwise, the first time a file is included, it might be put inside the
 // #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
 // compile unless that implementation is turned on).
 
+
 SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif // SIMDUTF_H
author	Jarred Sumner <jarred@jarredsumner.com>	2023-08-09 09:14:51 -0700
committer	GitHub <noreply@github.com>	2023-08-09 09:14:51 -0700
commit	b3019270c9640a60f7a30f172cea10e310baf3b6 (patch)
tree	8b6252ac910863f513a27444526b461c39be8e76
parent	5d7c77aab0761e16ef163dcf9792e8947bdab214 (diff)
download	bun-b3019270c9640a60f7a30f172cea10e310baf3b6.tar.gz bun-b3019270c9640a60f7a30f172cea10e310baf3b6.tar.zst bun-b3019270c9640a60f7a30f172cea10e310baf3b6.zip