diff options
author | 2022-04-25 07:09:18 -0700 | |
---|---|---|
committer | 2022-04-25 07:09:18 -0700 | |
commit | 2c6e5c3fc4a7255eb29f7ae618d2826dd9a7d5e5 (patch) | |
tree | 1de65b5cc11b7a479f92f853980d6a8ee640c9e6 /src/base64 | |
parent | 4b4df5095ea756388df4c26683ab0bb222750ed3 (diff) | |
download | bun-2c6e5c3fc4a7255eb29f7ae618d2826dd9a7d5e5.tar.gz bun-2c6e5c3fc4a7255eb29f7ae618d2826dd9a7d5e5.tar.zst bun-2c6e5c3fc4a7255eb29f7ae618d2826dd9a7d5e5.zip |
some of buffer
Diffstat (limited to 'src/base64')
-rw-r--r-- | src/base64/README.md | 8 | ||||
-rw-r--r-- | src/base64/base64.zig | 53 | ||||
-rw-r--r-- | src/base64/bun-base64.bc | bin | 0 -> 4096 bytes | |||
-rw-r--r-- | src/base64/bun-base64.c | 46 | ||||
-rw-r--r-- | src/base64/bun-base64.h | 7 | ||||
-rw-r--r-- | src/base64/chromiumbase64.bc | bin | 0 -> 15120 bytes | |||
-rw-r--r-- | src/base64/chromiumbase64.c | 415 | ||||
-rw-r--r-- | src/base64/chromiumbase64.h | 165 | ||||
-rw-r--r-- | src/base64/fastavxbase64.bc | bin | 0 -> 1856 bytes | |||
-rw-r--r-- | src/base64/fastavxbase64.c | 186 | ||||
-rw-r--r-- | src/base64/fastavxbase64.h | 41 | ||||
-rw-r--r-- | src/base64/neonbase64 | bin | 0 -> 17648 bytes | |||
-rw-r--r-- | src/base64/neonbase64.bc | bin | 0 -> 31120 bytes | |||
-rw-r--r-- | src/base64/neonbase64.cc | 120 |
14 files changed, 1041 insertions, 0 deletions
diff --git a/src/base64/README.md b/src/base64/README.md new file mode 100644 index 000000000..82bfb8ba2 --- /dev/null +++ b/src/base64/README.md @@ -0,0 +1,8 @@ +# Base64 + +This uses https://github.com/lemire/fastbase64 + +Changes: + +- chromiumbase64 doesn't add a null byte +- chromiumbase64 handles some whitespace characters more loosely diff --git a/src/base64/base64.zig b/src/base64/base64.zig new file mode 100644 index 000000000..fa4fc9dc9 --- /dev/null +++ b/src/base64/base64.zig @@ -0,0 +1,53 @@ +const std = @import("std"); + +extern fn bun_base64_encode(dest: [*]u8, src: [*]const u8, len: usize) usize; +extern fn bun_base64_decode(dest: [*]u8, src: [*]const u8, len: usize, out_len: *usize) usize; + +pub const DecodeResult = struct { + written: usize, + fail: bool = false, +}; + +pub fn decode(destination: []u8, source: []const u8) DecodeResult { + var out: usize = 0; + const ret = bun_base64_decode(destination.ptr, source.ptr, source.len, &out); + if (ret == std.math.maxInt(usize) - 1) { + return .{ + .written = out, + .fail = true, + }; + } + + // std.debug.assert(out == ret); + + return .{ + .written = out, + .fail = false, + }; +} + +pub fn encode(destination: []u8, source: []const u8) usize { + return bun_base64_encode(destination.ptr, source.ptr, source.len); +} + +/// Given a source string of length len, this returns the amount of +/// memory the destination string should have. 
+/// +/// remember, this is integer math +/// 3 bytes turn into 4 chars +/// ceiling[len / 3] * 4 +/// +/// +pub fn decodeLen(source: []const u8) usize { + return (source.len / 4 * 3 + 2); +} + +pub fn encodeLen(source: []const u8) usize { + return (source.len + 2) / 3 * 4; +} + +pub const urlsafe = std.base64.Base64DecoderWithIgnore.init( + std.base64.url_safe_alphabet_chars, + null, + "= \t\r\n" ++ [_]u8{ std.ascii.control_code.VT, std.ascii.control_code.FF }, +); diff --git a/src/base64/bun-base64.bc b/src/base64/bun-base64.bc Binary files differnew file mode 100644 index 000000000..6a004c8dc --- /dev/null +++ b/src/base64/bun-base64.bc diff --git a/src/base64/bun-base64.c b/src/base64/bun-base64.c new file mode 100644 index 000000000..e11f88da5 --- /dev/null +++ b/src/base64/bun-base64.c @@ -0,0 +1,46 @@ + +#include "bun-base64.h" + +#if defined(__GNUC__) && defined(__ARM_NEON__) + +int neon_base64_decode(char *out, const char *src, size_t srclen, + size_t *outlen); + +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + +#include "fastavxbase64.h" + +#endif + +#if defined(__GNUC__) && defined(__ARM_NEON__) +size_t bun_base64_decode(char *dest, const char *src, size_t len, + size_t *outlen) { + // neon base64 is decode only + return neon_base64_decode(dest, src, len, outlen); +} +size_t bun_base64_encode(char *dest, const char *src, size_t len) { + return chromium_base64_encode(dest, src, len); +} + +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + +size_t bun_base64_decode(char *dest, const char *src, size_t len, + size_t *outlen) { + return fast_avx2_base64_decode(dest, src, len, outlen); +} +size_t bun_base64_encode(char *dest, const char *src, size_t len) { + + return fast_avx2_base64_encode(dest, src, len); +} + +#else + +size_t bun_base64_decode(char *dest, const char *src, size_t len, + size_t *outlen) { + return chromium_base64_decode(dest, src, len, outlen); +} +size_t bun_base64_encode(char *dest, const char 
*src, size_t len) { + return chromium_base64_encode(dest, src, len); +} + +#endif
\ No newline at end of file diff --git a/src/base64/bun-base64.h b/src/base64/bun-base64.h new file mode 100644 index 000000000..82b15d73a --- /dev/null +++ b/src/base64/bun-base64.h @@ -0,0 +1,7 @@ + +#include "chromiumbase64.h" +#include "fastavxbase64.h" + +size_t bun_base64_decode(char *dest, const char *src, size_t len, + size_t *outlen); +size_t bun_base64_encode(char *dest, const char *str, size_t len);
\ No newline at end of file diff --git a/src/base64/chromiumbase64.bc b/src/base64/chromiumbase64.bc Binary files differnew file mode 100644 index 000000000..42178e5b8 --- /dev/null +++ b/src/base64/chromiumbase64.bc diff --git a/src/base64/chromiumbase64.c b/src/base64/chromiumbase64.c new file mode 100644 index 000000000..8fabd57d2 --- /dev/null +++ b/src/base64/chromiumbase64.c @@ -0,0 +1,415 @@ +#include "chromiumbase64.h" + +// from node: +static const int8_t unbase64_table[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -2, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 62, -1, 62, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, + 63, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}; + +#define CHAR62 '+' +#define CHAR63 '/' +#define CHARPAD '=' +static const char e0[256] = { + 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D', + 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', + 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L', + 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', + 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 
'S', 'S', 'S', + 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', + 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a', + 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', + 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', + 'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', + 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', + 'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', + 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', + 'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', + '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4', + '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', + '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/', + '/'}; + +static const char e1[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/'}; + +static const char e2[256] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', + 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', + 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', + '/'}; + +/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */ + +static const uint32_t d0[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, + 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, + 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, + 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, + 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048, + 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, + 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, + 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, + 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, + 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, + 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +static const uint32_t d1[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, + 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, + 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, + 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, + 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, + 0x00003001, 
0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, + 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, + 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, + 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, + 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, + 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +static const uint32_t d2[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, + 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, + 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, + 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, + 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, + 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, + 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, + 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, + 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, + 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, + 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +static const uint32_t d3[256] = { + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, + 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 
0x00390000, + 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, + 0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, + 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, + 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, + 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, + 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, + 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, + 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, + 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, + 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, + 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}; + +#define BADCHAR 0x01FFFFFF + +/** + * you can control if we use padding by commenting out this + * next line. However, I highly recommend you use padding and not + * using it should only be for compatability with a 3rd party. + * Also, 'no padding' is not tested! + */ +// #define DOPAD 1 + +/* + * if we aren't doing padding + * set the pad character to NULL + */ +#ifndef DOPAD +#undef CHARPAD +#define CHARPAD '\0' +#endif + +size_t chromium_base64_encode(char *dest, const char *str, size_t len) { + size_t i = 0; + uint8_t *p = (uint8_t *)dest; + + /* unsigned here is important! 
*/ + uint8_t t1, t2, t3; + + if (len > 2) { + for (; i < len - 2; i += 3) { + t1 = str[i]; + t2 = str[i + 1]; + t3 = str[i + 2]; + *p++ = e0[t1]; + *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *p++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *p++ = e2[t3]; + } + } + + switch (len - i) { + case 0: + break; + case 1: + t1 = str[i]; + *p++ = e0[t1]; + *p++ = e1[(t1 & 0x03) << 4]; + // *p++ = CHARPAD; + // *p++ = CHARPAD; + break; + default: /* case 2 */ + t1 = str[i]; + t2 = str[i + 1]; + *p++ = e0[t1]; + *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *p++ = e2[(t2 & 0x0F) << 2]; + // *p++ = CHARPAD; + } + + // Commented out because it already returns the length + // *p = '\0'; + return p - (uint8_t *)dest; +} + +size_t chromium_base64_decode(char *dest, const char *src, size_t len, + size_t *out_len) { + if (len == 0) { + *out_len = 0; + return 0; + } + +#ifdef DOPAD + /* + * if padding is used, then the message must be at least + * 4 chars and be a multiple of 4 + */ + if (len < 4 || (len % 4 != 0)) { + *out_len = 0; + return MODP_B64_ERROR; /* error */ + } + /* there can be at most 2 pad chars at the end */ + if (src[len - 1] == CHARPAD) { + len--; + if (src[len - 1] == CHARPAD) { + len--; + } + } +#endif + + size_t i; + int leftover = len % 4; + size_t chunks = (leftover == 0) ? 
len / 4 - 1 : len / 4; + + uint8_t *p = (uint8_t *)dest; + uint32_t x = 0; + const uint8_t *y = (uint8_t *)src; + for (i = 0; i < chunks;) { + x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; + if (x >= BADCHAR) { + // skip whitespace + // this is change bun added + if (y[0] < 64) { + y++; + continue; + } + + *out_len = p - (uint8_t *)dest; + return MODP_B64_ERROR; + } + + *p++ = ((uint8_t *)(&x))[0]; + *p++ = ((uint8_t *)(&x))[1]; + *p++ = ((uint8_t *)(&x))[2]; + y += 4; + ++i; + } + + switch (leftover) { + case 0: + x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; + + if (x >= BADCHAR) { + *out_len = p - (uint8_t *)dest + 1; + return MODP_B64_ERROR; + } + + *p++ = ((uint8_t *)(&x))[0]; + *p++ = ((uint8_t *)(&x))[1]; + *p = ((uint8_t *)(&x))[2]; + return (chunks + 1) * 3; + break; + case 1: /* with padding this is an impossible case */ + x = d0[y[0]]; + *p = *((uint8_t *)(&x)); // i.e. first char/byte in int + break; + case 2: // * case 2, 1 output byte */ + x = d0[y[0]] | d1[y[1]]; + *p = *((uint8_t *)(&x)); // i.e. first char + break; + default: /* case 3, 2 output bytes */ + x = d0[y[0]] | d1[y[1]] | d2[y[2]]; /* 0x3c */ + *p++ = ((uint8_t *)(&x))[0]; + *p = ((uint8_t *)(&x))[1]; + break; + } + + *out_len = 3 * chunks + (6 * leftover) / 8; + + if (x >= BADCHAR) + return MODP_B64_ERROR; + + return 3 * chunks + (6 * leftover) / 8; +}
\ No newline at end of file diff --git a/src/base64/chromiumbase64.h b/src/base64/chromiumbase64.h new file mode 100644 index 000000000..96b201c7b --- /dev/null +++ b/src/base64/chromiumbase64.h @@ -0,0 +1,165 @@ +/*************** + * Taken more or less as-is from the chromium project + ****************/ + +/** + * \file + * <PRE> + * High performance base64 encoder / decoder + * Version 1.3 -- 17-Mar-2006 + * + * Copyright © 2005, 2006, Nick Galbreath -- nickg [at] modp [dot] com + * All rights reserved. + * + * http://modp.com/release/base64 + * + * Released under bsd license. See modp_b64.c for details. + * </pre> + * + * The default implementation is the standard b64 encoding with padding. + * It's easy to change this to use "URL safe" characters and to remove + * padding. See the modp_b64.c source code for details. + * + */ + +#ifndef MODP_B64 +#define MODP_B64 + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define MODP_B64_ERROR ((size_t)-1) +/** + * Encode a raw binary string into base 64. + * str contains the bytes + * len contains the number of bytes in str + * dest should be allocated by the caller to contain + * at least chromium_base64_encode_len(len) bytes (see below) + * This will contain the b64 encoded result, which is NOT null-terminated + * returns the number of bytes written to dest + * i.e. no trailing null byte is written or counted + * + * Example + * + * \code + * char* src = ...; + * int srclen = ...; // the number of bytes in src + * char* dest = (char*) malloc(chromium_base64_encode_len(srclen)); + * int len = chromium_base64_encode(dest, src, srclen); + * if (len == MODP_B64_ERROR) { + * printf("Error\n"); + * } else { + * printf("b64 = %.*s\n", len, dest); + * } + * \endcode + * + */ +size_t chromium_base64_encode(char *dest, const char *str, size_t len); + +/** + * Decode a base64 encoded string + * + * + * src should contain exactly len bytes of b64 characters. 
+ * if src contains -any- non-base characters (such as white + space), MODP_B64_ERROR is returned. + * + * dest should be allocated by the caller to contain at least + * len * 3 / 4 bytes. + * + * Returns the length (strlen) of the output, or MODP_B64_ERROR if unable to + * decode + * + * \code + * char* src = ...; + * int srclen = ...; // or if you don't know use strlen(src) + * char* dest = (char*) malloc(chromium_base64_decode_len(srclen)); + * size_t outlen = 0; + * int len = chromium_base64_decode(dest, src, srclen, &outlen); + * if (len == MODP_B64_ERROR) { error } + * \endcode + */ +size_t chromium_base64_decode(char *dest, const char *src, size_t len, + size_t *out_len); + +/** + * Given a source string of length len, this returns the amount of + * memory the destination string should have. + * + * remember, this is integer math + * 3 bytes turn into 4 chars + * ceiling[len / 3] * 4 + 1 + * + * +1 is for any extra null. + */ +#define chromium_base64_encode_len(A) ((A + 2) / 3 * 4 + 1) + +/** + * Given a base64 string of length len, + * this returns the amount of memory required for output string + * It may be more than the actual number of bytes written. + * NOTE: remember this is integer math + * this allocates a bit more memory than traditional versions of b64 + * decode 4 chars turn into 3 bytes + * floor[len * 3/4] + 2 + */ +#define chromium_base64_decode_len(A) (A / 4 * 3 + 2) + +/** + * Will return the strlen of the output from encoding. + * This may be less than the required number of bytes allocated. 
+ * + * This allows you to 'deserialize' a struct + * \code + * char* b64encoded = "..."; + * int len = strlen(b64encoded); + * + * struct datastuff foo; + * size_t outlen = 0; + * if (chromium_base64_encode_strlen(sizeof(struct datastuff)) != len) { + * // wrong size + * return false; + * } else { + * // safe to do; + * if (chromium_base64_decode((char*) &foo, b64encoded, len, &outlen) == + * MODP_B64_ERROR) { + * // bad characters + * return false; + * } + * } + * // foo is filled out now + * \endcode + */ +#define chromium_base64_encode_strlen(A) ((A + 2) / 3 * 4) + +#ifdef __cplusplus +} + +#include <string> + +/** + * base 64 encode a string (self-modifying) + * On failure, the string is empty. + * + * This function is for C++ only (duh) + * + * \param[in,out] s the string to be encoded + * \return a reference to the input string + */ +inline std::string &chromium_base64_encode(std::string &s) { + std::string x(chromium_base64_encode_len(s.size()), '\0'); + size_t d = chromium_base64_encode(const_cast<char *>(x.data()), s.data(), + (int)s.size()); + if (d == MODP_B64_ERROR) { + x.clear(); + } else { + x.erase(d, std::string::npos); + } + s.swap(x); + return s; +} + +#endif /* __cplusplus */ +#endif
\ No newline at end of file diff --git a/src/base64/fastavxbase64.bc b/src/base64/fastavxbase64.bc Binary files differnew file mode 100644 index 000000000..f7ee9825d --- /dev/null +++ b/src/base64/fastavxbase64.bc diff --git a/src/base64/fastavxbase64.c b/src/base64/fastavxbase64.c new file mode 100644 index 000000000..47be824b4 --- /dev/null +++ b/src/base64/fastavxbase64.c @@ -0,0 +1,186 @@ +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#include "fastavxbase64.h" + +#include <stdbool.h> +#include <x86intrin.h> + +/** + * This code borrows from Wojciech Mula's library at + * https://github.com/WojciechMula/base64simd (published under BSD) + * as well as code from Alfred Klomp's library https://github.com/aklomp/base64 + * (published under BSD) + * + */ + +/** + * Note : Hardware such as Knights Landing might do poorly with this AVX2 code + * since it relies on shuffles. Alternatives might be faster. + */ + +static inline __m256i enc_reshuffle(const __m256i input) { + + // translation from SSE into AVX2 of procedure + // https://github.com/WojciechMula/base64simd/blob/master/encode/unpack_bigendian.cpp + const __m256i in = _mm256_shuffle_epi8( + input, + _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + + 14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5)); + + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + + return _mm256_or_si256(t1, t3); +} + +static inline __m256i enc_translate(const __m256i in) { + const __m256i lut = _mm256_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, 65, 71, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51)); + __m256i mask = 
_mm256_cmpgt_epi8((in), _mm256_set1_epi8(25)); + indices = _mm256_sub_epi8(indices, mask); + __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices)); + return out; +} + +static inline __m256i dec_reshuffle(__m256i in) { + + // inlined procedure pack_madd from + // https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx2.cpp + // The only difference is that elements are reversed, + // only the multiplication constants were changed. + + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16( + in, + _mm256_set1_epi32(0x01400140)); //_mm256_maddubs_epi16 is likely expensive + __m256i out = + _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // end of inlined + + // Pack bytes together within 32-bit words, discarding words 3 and 7: + out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, + 14, 13, 12, -1, -1, -1, -1, 2, + 1, 0, 6, 5, 4, 10, 9, 8, 14, + 13, 12, -1, -1, -1, -1)); + // the call to _mm256_permutevar8x32_epi32 could be replaced by a call to + // _mm256_storeu2_m128i but it is doubtful that it would help + return _mm256_permutevar8x32_epi32( + out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); +} + +size_t fast_avx2_base64_encode(char *dest, const char *str, size_t len) { + const char *const dest_orig = dest; + if (len >= 32 - 4) { + // first load is masked + __m256i inputvector = _mm256_maskload_epi32( + (int const *)(str - 4), + _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, + + 0x80000000, 0x80000000, 0x80000000, + 0x00000000 // we do not load the first 4 bytes + )); + ////////// + // Intel docs: Faults occur only due to mask-bit required memory accesses + // that caused the faults. Faults will not occur due to referencing any + // memory location if the corresponding mask bit for + // that memory location is 0. For example, no faults will be detected if the + // mask bits are all zero. 
+ //////////// + while (true) { + inputvector = enc_reshuffle(inputvector); + inputvector = enc_translate(inputvector); + _mm256_storeu_si256((__m256i *)dest, inputvector); + str += 24; + dest += 32; + len -= 24; + if (len >= 32) { + inputvector = + _mm256_loadu_si256((__m256i *)(str - 4)); // no need for a mask here + // we could do a mask load as long as len >= 24 + } else { + break; + } + } + } + size_t scalarret = chromium_base64_encode(dest, str, len); + if (scalarret == MODP_B64_ERROR) + return MODP_B64_ERROR; + return (dest - dest_orig) + scalarret; +} + +size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen, + size_t *outlen) { + char *out_orig = out; + while (srclen >= 45) { + + // The input consists of six character sets in the Base64 alphabet, + // which we need to map back to the 6-bit values they represent. + // There are three ranges, two singles, and then there's the rest. + // + // # From To Add Characters + // 1 [43] [62] +19 + + // 2 [47] [63] +16 / + // 3 [48..57] [52..61] +4 0..9 + // 4 [65..90] [0..25] -65 A..Z + // 5 [97..122] [26..51] -71 a..z + // (6) Everything else => invalid input + + __m256i str = _mm256_loadu_si256((__m256i *)src); + + // code by @aqrit from + // https://github.com/WojciechMula/base64simd/issues/3#issuecomment-271137490 + // transated into AVX2 + const __m256i lut_lo = _mm256_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, + 0x1B, 0x1B, 0x1B, 0x1A, 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + const __m256i lut_hi = _mm256_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + const __m256i lut_roll = _mm256_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 19, 4, + -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0); + + const __m256i 
mask_2F = _mm256_set1_epi8(0x2f); + + // lookup + __m256i hi_nibbles = _mm256_srli_epi32(str, 4); + __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + + hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i roll = + _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + + if (!_mm256_testz_si256(lo, hi)) { + break; + } + + str = _mm256_add_epi8(str, roll); + // end of copied function + + srclen -= 32; + src += 32; + + // end of inlined function + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + _mm256_storeu_si256((__m256i *)out, str); + out += 24; + } + size_t scalarret = chromium_base64_decode(out, src, srclen, outlen); + *outlen += (out - out_orig); + if (scalarret == MODP_B64_ERROR) + return MODP_B64_ERROR; + return (out - out_orig) + scalarret; +} +#endif
\ No newline at end of file diff --git a/src/base64/fastavxbase64.h b/src/base64/fastavxbase64.h new file mode 100644 index 000000000..d1064a5d1 --- /dev/null +++ b/src/base64/fastavxbase64.h @@ -0,0 +1,41 @@ +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + +#ifndef EXPAVX_B64 +#define EXPAVX_B64 + +/** + * Assumes recent x64 hardware with AVX2 instructions. + */ + +#include "chromiumbase64.h" +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * This code extends Nick Galbreath's high performance base 64decoder (used in + * Chromium), the API is the same effectively, see chromium64.h. + */ + +/* + * AVX2 accelerated version of Galbreath's chromium_base64_decode function + * Usage remains the same, see chromium.h. + */ +size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen, + size_t *outlen); + +/* + * AVX2 accelerated version of Galbreath's chromium_base64_encode function + * Usage remains the same, see chromium.h. + */ +size_t fast_avx2_base64_encode(char *dest, const char *str, size_t len); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif +#endif
// \ No newline at end of file
// diff --git a/src/base64/neonbase64 b/src/base64/neonbase64 Binary files differ; new file mode 100644 index 000000000..74153ebe4 --- /dev/null +++ b/src/base64/neonbase64
// diff --git a/src/base64/neonbase64.bc b/src/base64/neonbase64.bc Binary files differ; new file mode 100644 index 000000000..91c10bf4f --- /dev/null +++ b/src/base64/neonbase64.bc
// diff --git a/src/base64/neonbase64.cc b/src/base64/neonbase64.cc new file mode 100644 index 000000000..a1249d21b --- /dev/null +++ b/src/base64/neonbase64.cc @@ -0,0 +1,120 @@
// clang-format off
// NOTE(review): vqtbl1q_u8 below is an AArch64 (A64) intrinsic, while this
// guard only tests __ARM_NEON__ — confirm 32-bit ARM builds never compile
// this file.
#if defined (__GNUC__) && defined(__ARM_NEON__)

#include <arm_neon.h>
#include <cstddef>
#include "chromiumbase64.h"
// Only supply a fallback: an unconditional #define here would redefine
// whatever chromiumbase64.h already provides.
#ifndef MODP_B64_ERROR
#define MODP_B64_ERROR ((size_t)-1)
#endif

#include <iostream>


extern "C" int neon_base64_decode(char *out, const char *src, size_t srclen, size_t *outlen);


// The input consists of six character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
//  #  From       To        Add  Characters
//  1  [43]       [62]      +19  +
//  2  [47]       [63]      +16  /
//  3  [48..57]   [52..61]   +4  0..9
//  4  [65..90]   [0..25]   -65  A..Z
//  5  [97..122]  [26..51]  -71  a..z
// (6) Everything else => invalid input

/**
 * NEON base64 decoder.
 *
 * While at least 32 input characters remain, each loop iteration validates
 * and consumes 32 characters of `src` and writes exactly 24 bytes to `out`.
 * The loop stops early at the first 32-character chunk containing a character
 * outside the alphabet; the rest of the input (including that chunk) is
 * handed to chromium_base64_decode().
 *
 * \param out    output buffer
 * \param src    base64 text
 * \param srclen number of input characters
 * \param outlen passed to chromium_base64_decode() for the tail, then
 *               increased by the number of bytes the vector loop produced
 *               (this happens even when the scalar decoder reports an error)
 * \return bytes produced by the vector loop plus the scalar decoder's return
 *         value, narrowed to int; MODP_B64_ERROR narrowed to int when the
 *         scalar decoder reports an error. Because the return type is int,
 *         totals that do not fit in an int are not representable.
 */
int neon_base64_decode(char *out, const char *src, size_t srclen, size_t *outlen) {
  char *out_orig = out;
  const uint8x16_t lut_lo = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
                             0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
  const uint8x16_t lut_hi = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
                             0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
  // 191 and 185 are -65 and -71 modulo 256 (see the table above).
  const uint8x16_t lut_roll = {0, 16, 19, 4, 191, 191, 185, 185,
                               0, 0,  0,  0, 0,   0,   0,   0};
  const uint8x16_t zero8 = vdupq_n_u8(0);
  const uint16x8_t zero16 = vdupq_n_u16(0);
  const uint8x16_t k2f = vdupq_n_u8(0x2f);
  const uint8x16_t kf = vdupq_n_u8(0xf);
  const uint8x8_t cst = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
  const uint16x4_t cst1 = {0x1000, 0x1000, 0x1000, 0x1000};

  const uint8x8_t shuf0 = {2, 1, 0, 6, 5, 4, 2 + 8, 1 + 8};
  const uint8x8_t shuf1 = {0 + 8, 6 + 8, 5 + 8, 4 + 8,
                           2 + 16, 1 + 16, 0 + 16, 6 + 16};
  const uint8x8_t shuf2 = {5 + 16, 4 + 16, 2 + 24, 1 + 24,
                           0 + 24, 6 + 24, 5 + 24, 4 + 24};

  uint8x8x4_t pack;
  uint8x8_t res[3];
  uint8x16_t str[2];

  while (srclen >= 8 * 4) {
    __builtin_memcpy(str, src, 8 * 4);

    uint8x16_t in0 = str[0];
    uint8x16_t in1 = str[1];
    uint8x16_t lo_nibbles0 = vandq_u8(in0, kf);
    uint8x16_t lo_nibbles1 = vandq_u8(in1, kf);
    uint8x16_t hi_nibbles0 = vshrq_n_u8(in0, 4);
    uint8x16_t hi_nibbles1 = vshrq_n_u8(in1, 4);

    uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
    uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
    uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
    uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
    uint8x16_t test0 = vtstq_u8(lo0, hi0);
    uint8x16_t test1 = vtstq_u8(lo1, hi1);
    uint8x16_t orr0 = vorrq_u8(test0, test1);
    uint8x8_t orr1 = vorr_u8(vget_low_u8(orr0), vget_high_u8(orr0));
    // Any set bit marks an invalid character somewhere in this chunk. The
    // 64-bit lane is read through intrinsics instead of a GNU-extension
    // vector-to-integer cast.
    if (vget_lane_u64(vreinterpret_u64_u8(orr1), 0) != 0)
      break;

    uint8x16_t eq_2F0 = vceqq_u8(in0, k2f);
    uint8x16_t eq_2F1 = vceqq_u8(in1, k2f);
    uint8x16_t add0 = vaddq_u8(eq_2F0, hi_nibbles0);
    uint8x16_t add1 = vaddq_u8(eq_2F1, hi_nibbles1);
    uint8x16_t roll0 = vqtbl1q_u8(lut_roll, add0);
    uint8x16_t roll1 = vqtbl1q_u8(lut_roll, add1);
    uint8x16_t rolled0 = vaddq_u8(in0, roll0);
    uint8x16_t rolled1 = vaddq_u8(in1, roll1);

    // Step 1: swap and merge adjacent 6-bit fields.
    uint8x16x2_t unzip8 = vuzpq_u8(rolled0, rolled1);
    uint8x16x2_t zip8 = vzipq_u8(unzip8.val[1], zero8);
    uint16x8_t mul0 = vmlal_u8(vreinterpretq_u16_u8(zip8.val[0]),
                               vget_low_u8(unzip8.val[0]), cst);
    uint16x8_t mul1 = vmlal_u8(vreinterpretq_u16_u8(zip8.val[1]),
                               vget_high_u8(unzip8.val[0]), cst);

    // Step 2: swap and merge 12-bit words into a 24-bit word.
    uint16x8x2_t unzip16 = vuzpq_u16(mul0, mul1);
    uint16x8x2_t zip16 = vzipq_u16(unzip16.val[1], zero16);
    uint32x4_t merge0 = vmlal_u16(vreinterpretq_u32_u16(zip16.val[0]),
                                  vget_low_u16(unzip16.val[0]), cst1);
    uint32x4_t merge1 = vmlal_u16(vreinterpretq_u32_u16(zip16.val[1]),
                                  vget_high_u16(unzip16.val[0]), cst1);
    pack.val[0] = vget_low_u8(vreinterpretq_u8_u32(merge0));
    pack.val[1] = vget_high_u8(vreinterpretq_u8_u32(merge0));
    pack.val[2] = vget_low_u8(vreinterpretq_u8_u32(merge1));
    pack.val[3] = vget_high_u8(vreinterpretq_u8_u32(merge1));

    res[0] = vtbl4_u8(pack, shuf0);
    res[1] = vtbl4_u8(pack, shuf1);
    res[2] = vtbl4_u8(pack, shuf2);
    __builtin_memcpy(out, res, 6 * 4);

    out += 6 * 4;
    srclen -= 8 * 4;
    src += 8 * 4;
  }

//  std::cout << "Chromium? " << (out - out_orig) << std::endl;
  size_t scalarret = chromium_base64_decode(out, src, srclen, outlen);
  *outlen += (out - out_orig);
  if (scalarret == MODP_B64_ERROR)
    return static_cast<int>(MODP_B64_ERROR);
  // Explicit narrowing: the exported signature returns int.
  return static_cast<int>((out - out_orig) + scalarret);
}

#endif
\ No newline at end of file |