Diffstat
-rw-r--r--  src/base64/fastavxbase64.c | 186
1 file changed, 186 insertions, 0 deletions
diff --git a/src/base64/fastavxbase64.c b/src/base64/fastavxbase64.c
new file mode 100644
index 000000000..47be824b4
--- /dev/null
+++ b/src/base64/fastavxbase64.c
@@ -0,0 +1,186 @@
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#include "fastavxbase64.h"
+
+#include <stdbool.h>
+#include <x86intrin.h>
+
+/**
+ * This code borrows from Wojciech Mula's library at
+ * https://github.com/WojciechMula/base64simd (published under BSD)
+ * as well as code from Alfred Klomp's library https://github.com/aklomp/base64
+ * (published under BSD)
+ */
+
+/**
+ * Note: hardware such as Knights Landing might do poorly with this AVX2 code
+ * since it relies on shuffles. Alternatives might be faster.
+ */
+
+static inline __m256i enc_reshuffle(const __m256i input) {
+  // translation from SSE into AVX2 of procedure
+  // https://github.com/WojciechMula/base64simd/blob/master/encode/unpack_bigendian.cpp
+  const __m256i in = _mm256_shuffle_epi8(
+      input,
+      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+                      14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5));
+
+  const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+  const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+
+  const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+  const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+
+  return _mm256_or_si256(t1, t3);
+}
+
+static inline __m256i enc_translate(const __m256i in) {
+  const __m256i lut = _mm256_setr_epi8(
+      65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+      65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+  __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+  __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+  indices = _mm256_sub_epi8(indices, mask);
+  __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+  return out;
+}
+
+static inline __m256i dec_reshuffle(__m256i in) {
+  // inlined procedure pack_madd from
+  // https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx2.cpp
+  // The only difference is that the elements are reversed, so only the
+  // multiplication constants were changed.
+  const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(
+      in,
+      _mm256_set1_epi32(0x01400140)); // _mm256_maddubs_epi16 is likely expensive
+  __m256i out =
+      _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+  // end of inlined procedure
+
+  // Pack bytes together within 32-bit words, discarding words 3 and 7:
+  out = _mm256_shuffle_epi8(
+      out, _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1,
+                            -1, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1,
+                            -1, -1));
+  // The call to _mm256_permutevar8x32_epi32 could be replaced by a call to
+  // _mm256_storeu2_m128i, but it is doubtful that it would help.
+  return _mm256_permutevar8x32_epi32(
+      out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
+
+size_t fast_avx2_base64_encode(char *dest, const char *str, size_t len) {
+  const char *const dest_orig = dest;
+  if (len >= 32 - 4) {
+    // first load is masked
+    __m256i inputvector = _mm256_maskload_epi32(
+        (int const *)(str - 4),
+        _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000,
+                         0x80000000, 0x80000000, 0x80000000,
+                         0x00000000)); // we do not load the first 4 bytes
+    //////////
+    // Intel docs: Faults occur only due to mask-bit required memory accesses
+    // that caused the faults. Faults will not occur due to referencing any
+    // memory location if the corresponding mask bit for that memory location
+    // is 0. For example, no faults will be detected if the mask bits are all
+    // zero.
+    //////////
+    while (true) {
+      inputvector = enc_reshuffle(inputvector);
+      inputvector = enc_translate(inputvector);
+      _mm256_storeu_si256((__m256i *)dest, inputvector);
+      str += 24;
+      dest += 32;
+      len -= 24;
+      if (len >= 32) {
+        // no need for a mask here; we could use a masked load as long as
+        // len >= 24
+        inputvector = _mm256_loadu_si256((__m256i *)(str - 4));
+      } else {
+        break;
+      }
+    }
+  }
+  size_t scalarret = chromium_base64_encode(dest, str, len);
+  if (scalarret == MODP_B64_ERROR)
+    return MODP_B64_ERROR;
+  return (dest - dest_orig) + scalarret;
+}
+
+size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen,
+                               size_t *outlen) {
+  char *out_orig = out;
+  while (srclen >= 45) {
+    // The input consists of six character sets in the Base64 alphabet,
+    // which we need to map back to the 6-bit values they represent.
+    // There are three ranges, two singles, and then there's the rest.
+    //
+    //  #  From       To        Add  Characters
+    //  1  [43]       [62]      +19  +
+    //  2  [47]       [63]      +16  /
+    //  3  [48..57]   [52..61]   +4  0..9
+    //  4  [65..90]   [0..25]   -65  A..Z
+    //  5  [97..122]  [26..51]  -71  a..z
+    // (6) Everything else => invalid input
+
+    __m256i str = _mm256_loadu_si256((__m256i *)src);
+
+    // code by @aqrit from
+    // https://github.com/WojciechMula/base64simd/issues/3#issuecomment-271137490
+    // translated into AVX2
+    const __m256i lut_lo = _mm256_setr_epi8(
+        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A,
+        0x1B, 0x1B, 0x1B, 0x1A, 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+    const __m256i lut_hi = _mm256_setr_epi8(
+        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+    const __m256i lut_roll = _mm256_setr_epi8(
+        0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);
+
+    const __m256i mask_2F = _mm256_set1_epi8(0x2f);
+
+    // lookup
+    __m256i hi_nibbles = _mm256_srli_epi32(str, 4);
+    __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+
+    const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+    const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+
+    hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2F);
+    const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+    const __m256i roll =
+        _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+    if (!_mm256_testz_si256(lo, hi)) {
+      break;
+    }
+
+    str = _mm256_add_epi8(str, roll);
+    // end of copied function
+
+    srclen -= 32;
+    src += 32;
+
+    // Reshuffle the input to packed 12-byte output format:
+    str = dec_reshuffle(str);
+    _mm256_storeu_si256((__m256i *)out, str);
+    out += 24;
+  }
+  size_t scalarret = chromium_base64_decode(out, src, srclen, outlen);
+  *outlen += (out - out_orig);
+  if (scalarret == MODP_B64_ERROR)
+    return MODP_B64_ERROR;
+  return (out - out_orig) + scalarret;
+}
+#endif
\ No newline at end of file
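
For readers who want the net effect of enc_reshuffle and enc_translate without tracing the SIMD constants: per 3-byte input group, the shuffle plus the mulhi/mullo pair isolate four 6-bit indices, and the LUT then adds the usual base64 ASCII offsets. A scalar model of one such step, with hypothetical helper names that are not part of the patch:

#include <stdint.h>

/* Scalar model of one 3-byte -> 4-character encode step (illustration
 * only); the patch processes 8 such groups per 256-bit register. */
static void split_three(const uint8_t *s, uint8_t idx[4]) {
  /* enc_reshuffle amounts to big-endian unpacking of 24 bits. */
  uint32_t n = ((uint32_t)s[0] << 16) | ((uint32_t)s[1] << 8) | s[2];
  idx[0] = (n >> 18) & 0x3F;
  idx[1] = (n >> 12) & 0x3F;
  idx[2] = (n >> 6) & 0x3F;
  idx[3] = n & 0x3F;
}

static char encode_sixbit(uint8_t v) {
  /* enc_translate: derive the LUT index exactly as the vector code does. */
  static const int8_t lut[16] = {65, 71, -4, -4, -4, -4, -4, -4,
                                 -4, -4, -4, -4, -19, -16, 0, 0};
  uint8_t i = (uint8_t)((v > 51 ? v - 51 : 0) /* _mm256_subs_epu8(v, 51) */
                        + (v > 25));          /* minus the cmpgt mask    */
  return (char)(v + lut[i]);
}

With this index rule, v in 0..25 lands on offset 65 ('A'), 26..51 on 71 ('a' - 26), digits on -4, and '+' and '/' on -19 and -16, matching the four Base64 alphabet ranges.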
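
The str - 4 offset in the encoder exists because enc_reshuffle reads its 12 source bytes from positions 4..15 of each 128-bit lane; the masked load therefore places str[0..27] at register bytes 4..31 without ever touching memory before the buffer, which is what the quoted Intel fault guarantee makes safe. A mask-free equivalent, shown only to make the layout explicit (not part of the patch):

#include <string.h>
#include <immintrin.h>

/* Stage the 28 input bytes at offset 4 of a zeroed scratch buffer and
 * load that instead. The patch avoids this copy via
 * _mm256_maskload_epi32. Requires len >= 28, same as the patch. */
static __m256i first_block(const char *str) {
  unsigned char scratch[32] = {0};
  memcpy(scratch + 4, str, 28);
  return _mm256_loadu_si256((const __m256i *)scratch);
}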
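
On the decode side, the lut_lo/lut_hi pair is a bitmask classifier: each nibble of an input byte selects a small bit set, and the byte belongs to the Base64 alphabet exactly when the two sets are disjoint, a condition _mm256_testz_si256 checks for all 32 bytes at once. lut_roll then supplies the per-range additive offset from the table in the comment. A scalar rendering of both lookups, with hypothetical helper names:

#include <stdbool.h>
#include <stdint.h>

static const uint8_t lut_lo[16] = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11,
                                   0x11, 0x11, 0x11, 0x11, 0x13, 0x1A,
                                   0x1B, 0x1B, 0x1B, 0x1A};
static const uint8_t lut_hi[16] = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08,
                                   0x04, 0x08, 0x10, 0x10, 0x10, 0x10,
                                   0x10, 0x10, 0x10, 0x10};
static const int8_t lut_roll[16] = {0, 16, 19, 4, -65, -65, -71, -71,
                                    0, 0, 0, 0, 0, 0, 0, 0};

/* Valid exactly when the bit sets chosen by the two nibbles are disjoint.
 * (The vector code masks with 0x2F rather than 0x0F; pshufb only looks at
 * the low 4 index bits, so the scalar equivalent is the plain nibble.) */
static bool is_valid_base64_byte(uint8_t c) {
  return (lut_lo[c & 0x0F] & lut_hi[c >> 4]) == 0;
}

/* '/' (0x2F) steps its roll index down by one via the eq_2F mask,
 * which is why it gets +16 while '+' gets +19. */
static uint8_t decode_sixbit(uint8_t c) { /* assumes valid input */
  return (uint8_t)(c + lut_roll[(c >> 4) - (c == 0x2F)]);
}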
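
dec_reshuffle's two multiply-adds fuse four decoded 6-bit values into one 24-bit group before the byte shuffle discards the spare byte of each 32-bit word. The arithmetic per word, written out as a scalar sketch (illustration only):

#include <stdint.h>

/* Scalar model of the inlined pack_madd step. */
static uint32_t pack_four(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint32_t ab = a * 0x40u + b; /* _mm256_maddubs_epi16: ab = a<<6 | b   */
  uint32_t cd = c * 0x40u + d; /* and cd = c<<6 | d, per 16-bit lane    */
  return ab * 0x1000u + cd;    /* _mm256_madd_epi16: a<<18|b<<12|c<<6|d */
}

The subsequent _mm256_shuffle_epi8 keeps bytes 2, 1, 0 of each word, i.e. the triple in big-endian order, and _mm256_permutevar8x32_epi32 compacts the eight 3-byte groups into 24 contiguous output bytes.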
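
Both entry points hand the tail to the scalar chromium_base64_* routines, so they can be called like those functions. A minimal round-trip driver, assuming only what the patch itself references (the fastavxbase64.h header, the chromium_* fallbacks, MODP_B64_ERROR, and that the return values count output bytes as the patch's arithmetic implies):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "fastavxbase64.h"

int main(void) {
  const char msg[] = "The quick brown fox jumps over the lazy dog";
  size_t len = strlen(msg);

  char *enc = malloc(4 * ((len + 2) / 3) + 1); /* worst-case encoded size */
  size_t enclen = fast_avx2_base64_encode(enc, msg, len);
  if (enclen == MODP_B64_ERROR)
    return 1;

  /* +8 slack: the decode loop stores full 32-byte vectors while only
     advancing by 24, so the last store can run past the counted output. */
  char *dec = malloc(3 * (enclen / 4) + 8);
  size_t outlen = 0;
  size_t declen = fast_avx2_base64_decode(dec, enc, enclen, &outlen);
  if (declen == MODP_B64_ERROR)
    return 1;

  printf("decoded %zu bytes: %.*s\n", declen, (int)declen, dec);
  free(enc);
  free(dec);
  return 0;
}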