diff options
Diffstat (limited to 'src/bun.js/bindings/simdutf.h')
-rw-r--r-- | src/bun.js/bindings/simdutf.h | 511 |
1 files changed, 402 insertions, 109 deletions
diff --git a/src/bun.js/bindings/simdutf.h b/src/bun.js/bindings/simdutf.h index 3514f117d..0a57a69f7 100644 --- a/src/bun.js/bindings/simdutf.h +++ b/src/bun.js/bindings/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2023-01-02 15:43:33 -0500. Do not edit! */ +/* auto-generated on 2023-02-10 14:42:58 -0500. Do not edit! */ // dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h /* begin file include/simdutf.h */ #ifndef SIMDUTF_H @@ -64,6 +64,48 @@ #include <strings.h> #endif +/** + * We want to check that it is actually a little endian system at + * compile-time. + */ + +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) +#define SIMDUTF_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(_WIN32) +#define SIMDUTF_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +#include <machine/endian.h> +#elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__) +#include <sys/byteorder.h> +#else // defined(__APPLE__) || defined(__FreeBSD__) + +#ifdef __has_include +#if __has_include(<endian.h>) +#include <endian.h> +#endif //__has_include(<endian.h>) +#endif //__has_include + +#endif // defined(__APPLE__) || defined(__FreeBSD__) + + +#ifndef !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) +#define SIMDUTF_IS_BIG_ENDIAN 0 +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define SIMDUTF_IS_BIG_ENDIAN 0 +#else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define SIMDUTF_IS_BIG_ENDIAN 1 +#endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +#endif // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ + + +/** + * At this point in time, SIMDUTF_IS_BIG_ENDIAN is defined. 
+ */ + #ifdef _MSC_VER #define SIMDUTF_VISUAL_STUDIO 1 /** @@ -98,8 +140,10 @@ #define SIMDUTF_IS_ARM64 1 #elif defined(__PPC64__) || defined(_M_PPC64) //#define SIMDUTF_IS_PPC64 1 -#pragma message("The simdutf library does yet support SIMD acceleration under\ -POWER processors. Please see https://github.com/lemire/simdutf/issues/51") +// The simdutf library does not yet support SIMD acceleration under +// POWER processors. Please see https://github.com/lemire/simdutf/issues/51 +#elif defined(__s390__) +// s390 IBM system. Big endian. #else // The simdutf library is designed // for 64-bit processors and it seems that you are not @@ -456,6 +500,8 @@ enum endianness { BIG }; +bool match_system(endianness e); + std::string to_string(encoding_type bom); // Note that BOM for UTF8 is discouraged. @@ -492,7 +538,7 @@ enum error_code { HEADER_BITS, // Any byte must have fewer than 5 header bits. TOO_SHORT, // The leading byte must be followed by N-1 continuation bytes, where N is the UTF-8 character length // This is also the error when the input is truncated. - TOO_LONG, // The leading byte must not be a continuation byte. + TOO_LONG, // We either have too many consecutive continuation bytes or the string starts with a continuation byte. OVERLONG, // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters, // and U+FFFF for four-byte characters. TOO_LARGE, // The decoded character must be less than or equal to U+10FFFF OR less than or equal to U+7F for ASCII. @@ -526,18 +572,18 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "2.1.0" +#define SIMDUTF_VERSION "3.2.0" namespace simdutf { enum { /** * The major version (MAJOR.minor.revision) of simdutf being used. */ - SIMDUTF_VERSION_MAJOR = 2, + SIMDUTF_VERSION_MAJOR = 3, /** * The minor version (major.MINOR.revision) of simdutf being used.
*/ - SIMDUTF_VERSION_MINOR = 1, + SIMDUTF_VERSION_MINOR = 2, /** * The revision (major.minor.REVISION) of simdutf being used. */ @@ -873,6 +919,21 @@ simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept; simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept; /** + * Using native endianness; Validate the UTF-16 string. + * This function may be best when you expect the input to be almost always valid. + * Otherwise, consider using validate_utf16_with_errors. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16. + */ +simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept; + +/** * Validate the UTF-16LE string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using * validate_utf16le_with_errors. @@ -903,6 +964,20 @@ simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) noexc simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) noexcept; /** + * Using native endianness; Validate the UTF-16 string and stop on error. + * It might be faster than validate_utf16 when an error is expected to occur early. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16 string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. + */ +simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, size_t len) noexcept; + +/** * Validate the UTF-16LE string and stop on error. 
It might be faster than * validate_utf16le when an error is expected to occur early. * @@ -931,7 +1006,7 @@ simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, siz simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) noexcept; /** - * Validate the UTF-32LE string. This function may be best when you expect + * Validate the UTF-32 string. This function may be best when you expect * the input to be almost always valid. Otherwise, consider using * validate_utf32_with_errors. * @@ -939,27 +1014,40 @@ simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, siz * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). - * @return true if and only if the string is valid UTF-32LE. + * @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcept; /** - * Validate the UTF-32LE string and stop on error. It might be faster than + * Validate the UTF-32 string and stop on error. It might be faster than * validate_utf32 when an error is expected to occur early. * * Overridden by each implementation. * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. */ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; /** + * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. 
+ * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_output the pointer to a buffer that can hold the conversion result + * @return the number of written char16_t; 0 if the input was not a valid UTF-8 string + */ +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** * Convert possibly broken UTF-8 string into UTF-16LE string. * * During the conversion also validation of the input string is done. @@ -986,6 +1074,20 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; /** + * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 + * string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_output the pointer to a buffer that can hold the conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; + +/** * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error. * * During the conversion also validation of the input string is done. @@ -1012,7 +1114,7 @@ simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * inpu simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept; /** - * Convert possibly broken UTF-8 string into UTF-32LE string. + * Convert possibly broken UTF-8 string into UTF-32 string.
* * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1025,7 +1127,7 @@ simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * inpu simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept; /** - * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1038,6 +1140,18 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; /** + * Using native endianness; Convert valid UTF-8 string into UTF-16 string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** * Convert valid UTF-8 string into UTF-16LE string. * * This function assumes that the input string is valid UTF-8. @@ -1062,7 +1176,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, siz simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-8 string into UTF-32LE string. + * Convert valid UTF-8 string into UTF-32 string. * * This function assumes that the input string is valid UTF-8. 
* @@ -1087,7 +1201,7 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. * * This function is equivalent to count_utf8 * @@ -1097,11 +1211,26 @@ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t len * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept; /** + * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16 string + */ +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** * Convert possibly broken UTF-16LE string into UTF-8 string. * * During the conversion also validation of the input string is done.
@@ -1132,6 +1261,21 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_ simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** + * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. + */ +simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. @@ -1162,6 +1306,20 @@ simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** + * Using native endianness; Convert valid UTF-16 string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16 (native endianness). + * + * This function is not BOM-aware.
+ * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** * Convert valid UTF-16LE string into UTF-8 string. * * This function assumes that the input string is valid UTF-16LE. @@ -1190,7 +1348,22 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * input, simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string. + * Using native endianness; Convert possibly broken UTF-16 string into UTF-32 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16 string + */ +simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources.
@@ -1205,7 +1378,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * input, simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string. + * Convert possibly broken UTF-16BE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1220,7 +1393,23 @@ simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * input, size simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * Using native endianness; Convert possibly broken UTF-16 string into + * UTF-32 string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char32_t written if successful. + */ +simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -1235,7 +1424,7 @@ simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * input, size simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1250,7 +1439,21 @@ simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert valid UTF-16LE string into UTF-32LE string. + * Using native endianness; Convert valid UTF-16 string into UTF-32 string. + * + * This function assumes that the input string is valid UTF-16 (native endianness). + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf32_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-32 string. * * This function assumes that the input string is valid UTF-16LE. * @@ -1264,7 +1467,7 @@ simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** - * Convert valid UTF-16BE string into UTF-32LE string. + * Convert valid UTF-16BE string into UTF-32 string. 
* * This function assumes that the input string is valid UTF-16BE. * @@ -1278,6 +1481,18 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept; /** + * Using native endianness; Compute the number of bytes that this UTF-16 + * string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16 string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; + +/** * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. * * This function does not validate the input. @@ -1300,29 +1515,29 @@ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-8 string. + * Convert possibly broken UTF-32 string into UTF-8 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware.
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. + * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. @@ -1330,13 +1545,13 @@ simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * input, size_t simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-8 string. + * Convert valid UTF-32 string into UTF-8 string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1344,44 +1559,75 @@ simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * in simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string. + * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string + */ +simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
+ * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string. + * Convert possibly broken UTF-32 string into UTF-16BE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * Using native endianness; Convert possibly broken UTF-32 string into UTF-16 + * string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. + */ +simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -1389,14 +1635,14 @@ simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * input, size simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -1404,13 +1650,27 @@ simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-16LE string. + * Using native endianness; Convert valid UTF-32 string into UTF-16 string. + * + * This function assumes that the input string is valid UTF-32. + * + * This function is not BOM-aware. + * + * @param input the UTF-32 string to convert + * @param length the length of the string in 4-byte words (char32_t) + * @param utf16_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; + +/** + * Convert valid UTF-32 string into UTF-16LE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1418,13 +1678,13 @@ simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept; /** - * Convert valid UTF-32LE string into UTF-16BE string. + * Convert valid UTF-32 string into UTF-16BE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -1446,29 +1706,45 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * input void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept; /** - * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. * * This function does not validate the input. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept; /** - * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + * @return the number of two-byte words required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept; /** - * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. + * Using native endianness; Compute the number of 4-byte words that this UTF-16 + * string would require in UTF-32 format. + * + * This function is equivalent to count_utf16. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of 4-byte words (char32_t) required to encode the UTF-16 string as UTF-32 + */ +simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Compute the number of 4-byte words that this UTF-16LE string would require in UTF-32 format. * * This function is equivalent to count_utf16le.
* @@ -1478,12 +1754,12 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_ * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept; /** - * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. + * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. * * This function is equivalent to count_utf16be. * @@ -1493,7 +1769,7 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, siz * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept; @@ -1501,6 +1777,20 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, siz * Count the number of code points (characters) in the string assuming that * it is valid. * + * This function assumes that the input string is valid UTF-16 (native endianness). + * + * This function is not BOM-aware. + * + * @param input the UTF-16 string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * * This function assumes that the input string is valid UTF-16LE. * * This function is not BOM-aware. 
@@ -1599,7 +1889,7 @@ public: * * @return a mask of all required `internal::instruction_set::` values */ - virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }; + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; } /** @@ -1705,26 +1995,26 @@ public: simdutf_warn_unused virtual result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept = 0; /** - * Validate the UTF-32LE string. + * Validate the UTF-32 string. * * Overridden by each implementation. * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). - * @return true if and only if the string is valid UTF-32LE. + * @return true if and only if the string is valid UTF-32. */ simdutf_warn_unused virtual bool validate_utf32(const char32_t *buf, size_t len) const noexcept = 0; /** - * Validate the UTF-32LE string and stop on error. + * Validate the UTF-32 string and stop on error. * * Overridden by each implementation. * * This function is not BOM-aware. * - * @param buf the UTF-32LE string to validate. + * @param buf the UTF-32 string to validate. * @param len the length of the string in number of 4-byte words (char32_t). * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful. */ @@ -1783,7 +2073,7 @@ public: simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-32LE string. + * Convert possibly broken UTF-8 string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. 
@@ -1796,7 +2086,7 @@ public: simdutf_warn_unused virtual size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) const noexcept = 0; /** - * Convert possibly broken UTF-8 string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-8 string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1833,7 +2123,7 @@ public: simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-8 string into UTF-32LE string. + * Convert valid UTF-8 string into UTF-32 string. * * This function assumes that the input string is valid UTF-8. * @@ -1856,7 +2146,7 @@ public: simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; /** - * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32LE format. + * Compute the number of 4-byte words that this UTF-8 string would require in UTF-32 format. * * This function is equivalent to count_utf8. * @@ -1864,7 +2154,7 @@ public: * * @param input the UTF-8 string to process * @param length the length of the string in bytes - * @return the number of char32_t words required to encode the UTF-8 string as UTF-32LE + * @return the number of char32_t words required to encode the UTF-8 string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0; @@ -1957,7 +2247,7 @@ public: simdutf_warn_unused virtual size_t convert_valid_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string. + * Convert possibly broken UTF-16LE string into UTF-32 string. 
* * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1972,7 +2262,7 @@ public: simdutf_warn_unused virtual size_t convert_utf16le_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string. + * Convert possibly broken UTF-16BE string into UTF-32 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -1987,7 +2277,7 @@ public: simdutf_warn_unused virtual size_t convert_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16LE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2002,7 +2292,7 @@ public: simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-16BE string into UTF-32LE string and stop on error. + * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. @@ -2017,7 +2307,7 @@ public: simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(const char16_t * input, size_t length, char32_t* utf32_buffer) const noexcept = 0; /** - * Convert valid UTF-16LE string into UTF-32LE string. + * Convert valid UTF-16LE string into UTF-32 string. * * This function assumes that the input string is valid UTF-16LE. 
* @@ -2071,29 +2361,29 @@ public: simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-8 string. + * Convert possibly broken UTF-32 string into UTF-8 string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-8 string and stop on error. + * Convert possibly broken UTF-32 string into UTF-8 string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful. @@ -2101,13 +2391,13 @@ public: simdutf_warn_unused virtual result convert_utf32_to_utf8_with_errors(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-8 string. 
+ * Convert valid UTF-32 string into UTF-8 string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf8_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2115,44 +2405,44 @@ public: simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string. + * Convert possibly broken UTF-32 string into UTF-16LE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string. + * Convert possibly broken UTF-32 string into UTF-16BE string. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return number of written words; 0 if input is not a valid UTF-32LE string + * @return number of written words; 0 if input is not a valid UTF-32 string */ simdutf_warn_unused virtual size_t convert_utf32_to_utf16be(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16LE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -2160,14 +2450,14 @@ public: simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert possibly broken UTF-32LE string into UTF-16BE string and stop on error. + * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error. * * During the conversion also validation of the input string is done. * This function is suitable to work with inputs from untrusted sources. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold conversion result * @return a result pair struct with an error code and either the position of the error if any or the number of char16_t written if successful. @@ -2175,13 +2465,13 @@ public: simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-16LE string. + * Convert valid UTF-32 string into UTF-16LE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2189,13 +2479,13 @@ public: simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; /** - * Convert valid UTF-32LE string into UTF-16BE string. + * Convert valid UTF-32 string into UTF-16BE string. * - * This function assumes that the input string is valid UTF-32LE. + * This function assumes that the input string is valid UTF-32. * * This function is not BOM-aware. 
* - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) * @param utf16_buffer the pointer to buffer that can hold the conversion result * @return number of written words; 0 if conversion is not possible @@ -2217,29 +2507,29 @@ public: virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0; /** - * Compute the number of bytes that this UTF-32LE string would require in UTF-8 format. + * Compute the number of bytes that this UTF-32 string would require in UTF-8 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-8 + * @return the number of bytes required to encode the UTF-32 string as UTF-8 */ simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; /** - * Compute the number of two-byte words that this UTF-32LE string would require in UTF-16 format. + * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format. * * This function does not validate the input. * - * @param input the UTF-32LE string to convert + * @param input the UTF-32 string to convert * @param length the length of the string in 4-byte words (char32_t) - * @return the number of bytes required to encode the UTF-32LE string as UTF-16 + * @return the number of bytes required to encode the UTF-32 string as UTF-16 */ simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0; /* - * Compute the number of bytes that this UTF-16LE string would require in UTF-32LE format. 
+ * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format. * * This function is equivalent to count_utf16le. * @@ -2249,12 +2539,12 @@ public: * * @param input the UTF-16LE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16LE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16LE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept = 0; /* - * Compute the number of bytes that this UTF-16BE string would require in UTF-32LE format. + * Compute the number of bytes that this UTF-16BE string would require in UTF-32 format. * * This function is equivalent to count_utf16be. * @@ -2264,7 +2554,7 @@ public: * * @param input the UTF-16BE string to convert * @param length the length of the string in 2-byte words (char16_t) - * @return the number of bytes required to encode the UTF-16BE string as UTF-32LE + * @return the number of bytes required to encode the UTF-16BE string as UTF-32 */ simdutf_warn_unused virtual size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0; @@ -2429,19 +2719,22 @@ private: #endif }; +class detect_best_supported_implementation_on_first_use; + } // namespace internal /** * The list of available implementations compiled into simdutf. */ -extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations; +extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); /** * The active implementation. * * Automatically initialized on first use to the most advanced implementation supported by this hardware. 
*/ -extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation; +extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation(); + } // namespace simdutf |