@bwasti
Last active October 19, 2021 02:21
binary activations for fast inference on intel devices
/*
AVX2:
  g++ dot_cache.cc -march=native -O3
AVX512:
  g++ -DVEC_WIDTH=512 dot_cache.cc -march=native -O3
2x larger cache (P=8 is the sweet spot):
  g++ -DP=16 dot_cache.cc -march=native -O3

The premise is that we want to multiply binary activations
by high-precision weights.

The trick is that we can precompute the partial dot products
(P sets how many activation bits each precomputed entry covers,
and therefore how big the precompute cache is) and use P-bit
chunks of the binary activations as indices into that cache
(reinterpreting the bits as integers). A scalar sketch of the
idea follows this comment.

Results on AVX2, int16 weights, binary activations, batch=8,
2 layer 32 neuron MLP + relu: ~71 ns per batch, 8.89287 ns/infer,
460.594 Gops.

Extensions/TODOs:
- We can increase the activation precision by using multiple
  horizontal layers and computing the bits individually
  (e.g. CSA logic).
- Need to test the limit of the precompute size.
- Need to test AVX512.
- Need to test prefetching.
*/
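// A minimal scalar sketch of the precompute trick described above (illustration
// only, not used by the code below; `table` and `p` are hypothetical names):
// for one output neuron, table[c * (1 << p) + b] holds the pre-summed weights of
// p-bit chunk c for bit pattern b, so a K-element binary dot product becomes
// K / p table lookups instead of K multiply-adds.
static inline int scalar_binary_dot(unsigned int a, const int *table, int K, int p) {
  int acc = 0;
  for (int c = 0; c < K / p; ++c) {
    int b = (a >> (c * p)) & ((1 << p) - 1); // p activation bits reused as a table index
    acc += table[c * (1 << p) + b];          // precomputed partial dot product
  }
  return acc;
}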
#include <bitset>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>
#define NONE 0
#define INTEL 1
#define ARM 2
#ifndef ISA
#define ISA INTEL
#endif
#if ISA == INTEL
#include <immintrin.h>
#else
#endif
#ifndef P
#define P 8
#endif
#ifndef VEC_WIDTH
#if ISA == INTEL
#define VEC_WIDTH 256
#else
#define VEC_WIDTH 128
#endif
#endif
#define VEC_BYTES (VEC_WIDTH / 8)
// one fully-connected layer: binary activation word in, binary output word out,
// using the pre-summed weight cache built by to_cache() below
template <typename T> int32_t layer(int32_t a, T *weights);
template <typename T> int64_t layer_64(int64_t a, T *weights);
// 2-bit activations, split into bit planes: element k has value
// a_k = (((a_1 >> k) & 1) << 1) | ((a_0 >> k) & 1), i.e. a_1 holds the high
// bits and a_0 the low bits
int32_t layer_2bit(int32_t a_0, int32_t a_1, int16_t *weights) {
auto v0 = _mm256_set1_epi16(0);
auto v1 = _mm256_set1_epi16(0);
auto zero = _mm256_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
// low-bit plane (a_0): accumulate the precomputed partial sums directly
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a_0 >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
// high-bit plane (a_1): these bits carry weight 2, so the partial sums are
// shifted left by 1 before accumulating
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a_1 >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0_ = _mm256_slli_epi16(v0_, 1);
v1_ = _mm256_slli_epi16(v1_, 1);
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
// binarize: output bit n is 1 iff neuron n's sum is > 0; the byte blend
// interleaves the two accumulators (matching the permuted layout written by
// to_cache) so a single movemask packs all 32 output bits
v0 = _mm256_cmpgt_epi16(v0, zero);
v1 = _mm256_cmpgt_epi16(v1, zero);
auto outv = _mm256_blendv_epi8(v0, v1, mask);
return _mm256_movemask_epi8(outv);
}
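// Scalar reference for the 2-bit case (illustration only, not used below): each
// activation a_k in {0,1,2,3} splits into bit planes a_1 (high) and a_0 (low),
// so the dot product is the a_0 lookup sum plus twice the a_1 lookup sum --
// which is why the second loop above shifts its partial sums left by one.
static inline int scalar_dot_2bit(uint32_t a_0, uint32_t a_1, const int16_t *w /* 32 weights */) {
  int acc = 0;
  for (int k = 0; k < 32; ++k) {
    int a_k = (((a_1 >> k) & 1) << 1) | ((a_0 >> k) & 1); // reassemble the 2-bit value
    acc += a_k * w[k];
  }
  return acc;
}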
#if ISA == INTEL
#if VEC_WIDTH == 256
template <> int32_t layer(int32_t a, int8_t *weights) {
auto psN = 1;
auto psP = 1 << P;
auto vec_width = VEC_BYTES / sizeof(int8_t);
auto v = _mm256_set1_epi8(0);
auto zero = _mm256_set1_epi8(0);
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + b));
__m256i v_ = _mm256_loadu_si256((__m256i *)&(weights[idx]));
v = _mm256_add_epi8(v_, v);
}
v = _mm256_cmpgt_epi8(v, zero);
return _mm256_movemask_epi8(v);
}
template <> int32_t layer(int32_t a, int16_t *weights) {
auto v0 = _mm256_set1_epi16(0);
auto v1 = _mm256_set1_epi16(0);
auto zero = _mm256_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
v0 = _mm256_cmpgt_epi16(v0, zero);
v1 = _mm256_cmpgt_epi16(v1, zero);
auto outv = _mm256_blendv_epi8(v0, v1, mask);
return _mm256_movemask_epi8(outv);
}
#else // VEC_WIDTH == 512
template <> int32_t layer(int32_t a, int16_t *weights) {
auto v0 = _mm512_set1_epi16(0);
auto v1 = _mm512_set1_epi16(0);
auto zero = _mm512_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
__m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
v0 = _mm512_add_epi16(v0_, v0);
}
auto out_mask = _mm512_cmpgt_epi16_mask(v0, zero);
return _cvtmask32_u32(out_mask);
}
template <> int32_t layer(int32_t a, float *weights) {
auto v0 = _mm512_set1_ps(0);
auto v1 = _mm512_set1_ps(0);
auto zero = _mm512_set1_ps(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(float);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
auto v0_ = _mm512_loadu_ps((__m512 *)&(weights[idx0]));
auto v1_ = _mm512_loadu_ps((__m512 *)&(weights[idx1]));
v0 = _mm512_add_ps(v0_, v0);
v1 = _mm512_add_ps(v1_, v1);
}
auto v0_m = _mm512_cmp_ps_mask(v0, zero, _CMP_GT_OQ);
auto v1_m = _mm512_cmp_ps_mask(v1, zero, _CMP_GT_OQ);
return _cvtmask16_u32(v0_m) | _cvtmask16_u32(v1_m) << 16;
}
template <> int64_t layer_64(int64_t a, int16_t *weights) {
auto v0 = _mm512_set1_epi16(0);
auto v1 = _mm512_set1_epi16(0);
auto zero = _mm512_set1_epi16(0);
auto N = 64;
auto K = 64;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < K; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
__m512i v1_ = _mm512_loadu_si512((__m512i *)&(weights[idx1]));
v0 = _mm512_add_epi16(v0_, v0);
v1 = _mm512_add_epi16(v1, v1_);
}
auto v0_m = _mm512_cmpgt_epi16_mask(v0, zero);
auto v1_m = _mm512_cmpgt_epi16_mask(v1, zero);
return (int64_t)_cvtmask32_u32(v0_m) | (int64_t)_cvtmask32_u32(v1_m) << 32;
}
template <> int64_t layer_64(int64_t a, int8_t *weights) {
auto v = _mm512_set1_epi8(0);
auto zero = _mm512_set1_epi8(0);
auto N = 64;
auto K = 64;
auto vec_width = VEC_BYTES / sizeof(int8_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < K; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + b));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi8(v_, v);
}
auto v_m = _mm512_cmpgt_epi8_mask(v, zero);
return _cvtmask64_u64(v_m);
}
// 64 binary inputs (passed as two 32-bit words) -> 32 binary outputs
int32_t layer_64_32(int32_t a, int32_t b, int16_t *weights) {
auto zero = _mm512_set1_epi16(0);
auto N = 32;
auto K = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = 1;
auto psP = 1 << P;
auto v = _mm512_set1_epi16(0);
// first a
for (auto i = 0; i < K; i += P) {
auto bin = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + bin));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi16(v_, v);
}
// then b
for (auto i = 0; i < K; i += P) {
auto bin = (b >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * ((i + K) / P) + bin));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi16(v_, v);
}
auto out_mask = _mm512_cmpgt_epi16_mask(v, zero);
return _cvtmask32_u32(out_mask);
}
void batch_layer_64_32(int32_t *inps, int N, int16_t *weights, int32_t *outs) {
for (auto i = 0; i < N; ++i) {
outs[i] = layer_64_32(inps[i], ~inps[i], weights);
}
}
#endif // VEC_WIDTH
#else // ISA != INTEL
template <typename T> int32_t layer(int32_t a, T *weights) {
auto psN = 1;
auto psP = 1 << P;
T v[32] = { 0 };
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = 32 * (psN * (psP * (i / P) + b));
for (auto k = 0; k < 32; ++k) {
v[k] += weights[idx + k];
}
}
uint32_t out = 0;
for (auto k = 0; k < 32; ++k) {
out |= v[k] > 0 ? (1u << k) : 0;
}
return out;
}
template <typename T> int64_t layer_64(int64_t a, T *weights) {
auto psN = 1;
auto psP = 1 << P;
T v[64] = { 0 };
for (auto i = 0; i < 64; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = 64 * (psN * (psP * (i / P) + b));
for (auto k = 0; k < 64; ++k) {
v[k] += weights[idx + k];
}
}
uint64_t out = 0;
for (auto k = 0; k < 64; ++k) {
out |= v[k] > 0 ? ((uint64_t)1 << k) : 0;
}
return out;
}
#endif
// apply `layer` (32-bit activations) or `layer_64` (64-bit activations) across a batch
template <typename T, typename BT>
void batch_layer(BT *inps, int N, T *weights, BT *outs) {
if (sizeof(BT) == 4) {
for (auto i = 0; i < N; ++i) {
outs[i] = layer<T>(inps[i], weights);
}
} else {
for (auto i = 0; i < N; ++i) {
outs[i] = layer_64(inps[i], weights);
}
}
}
// convert a K x N weight matrix into a (K / P) x 2^P x N pre-summed, indexable cache
template <typename T>
std::vector<T> to_cache(std::vector<T> orig, int K, int N) {
auto vec_width = VEC_BYTES / sizeof(T);
assert(N % vec_width == 0);
assert(K % P == 0);
auto psN = N / vec_width;
auto psP = 1 << P;
auto psK = K / P;
std::vector<T> out(N * psK * psP);
for (auto n = 0; n < N; ++n) {
auto n_ = n;
// permute neuron order so the int16 AVX2 path can merge its two accumulators
// with a single byte blend + movemask after binarization
if (sizeof(T) == 2 && VEC_WIDTH == 256) {
n_ = (n % psN) * vec_width + n / psN;
}
for (auto o_k = 0; o_k < psK; ++o_k) {
for (auto b = 0; b < psP; ++b) {
T tot = 0;
for (auto i_k = 0; i_k < P; ++i_k) {
auto k = o_k * P + i_k;
if ((b >> i_k) & 1) {
tot += orig[k * N + n];
}
}
auto idx = psN * vec_width * (psP * (o_k) + b) + n_;
out[idx] = tot;
}
}
}
return out;
}
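// Scalar illustration of the cache construction above (illustration only, not
// used below; single-neuron layout, no vector permutation): entry (c, b) is the
// sum of the weights w[c * p + j] over the set bits j of the pattern b.
template <typename T>
std::vector<T> scalar_to_cache(const std::vector<T> &w /* K weights, one neuron */,
                               int K, int p) {
  std::vector<T> table((K / p) * (1 << p));
  for (int c = 0; c < K / p; ++c) {
    for (int b = 0; b < (1 << p); ++b) {
      T tot = 0;
      for (int j = 0; j < p; ++j) {
        if ((b >> j) & 1) {
          tot += w[c * p + j];
        }
      }
      table[c * (1 << p) + b] = tot;
    }
  }
  return table;
}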
template <typename T> std::vector<T> rand_mat(int K, int N) {
std::vector<T> out(N * K);
for (auto k = 0; k < K; ++k) {
for (auto n = 0; n < N; ++n) {
auto i = rand() % 10 - 5;
if (sizeof(T) > 1) {
i = rand() % 100 - 50;
}
out[k * N + n] = i;
}
}
return out;
}
template <typename T, typename IN, typename OUT>
OUT ref_layer(IN inp, std::vector<T> W) {
OUT out = 0;
// std::cout << "ref ";
for (auto n = 0; n < sizeof(OUT) * 8; ++n) {
T tot = 0;
for (auto k = 0; k < sizeof(IN) * 8; ++k) {
if ((inp >> k) & 1) {
tot += W[k * sizeof(OUT) * 8 + n];
}
}
// std::cout << tot << " ";
if (tot > 0) {
out |= ((uint64_t)1 << n);
}
}
// std::cout << "\n";
return out;
}
template <typename T>
void ref_layer_full(T* inp, T* W, int M, int N, int K, T* out) {
//std::cout << "ref ";
for (auto n = 0; n < N; ++n) {
for (auto m = 0; m < M; ++m) {
T tot = 0;
for (auto k = 0; k < K; ++k) {
tot += inp[m * K + k] * W[k * N + n];
}
out[m * N + n] = tot;
//std::cout << tot << " ";
}
}
//std::cout << "\n";
}
template <typename T>
int32_t ref_layer(int32_t inp, int32_t inp2, std::vector<T> W) {
int32_t out = 0;
for (auto n = 0; n < 32; ++n) {
T tot = 0;
for (auto k = 0; k < 32; ++k) {
if ((inp >> k) & 1) {
tot += W[k * 32 + n];
}
}
for (auto k = 32; k < 64; ++k) {
if ((inp2 >> (k - 32)) & 1) {
tot += W[k * 32 + n];
}
}
if (tot > 0) {
out |= (1 << n);
}
}
return out;
}
// https://crypto.stackexchange.com/questions/16219/cryptographic-hash-function-for-32-bit-length-input-keys
uint32_t perm32(uint32_t x) {
int n = 12;
do {
x = ((x >> 8) ^ x) * 0x6B + n;
} while (--n != 0);
return x;
}
template <typename T> void bench() {
auto W = rand_mat<T>(32, 32);
auto psW = to_cache(W, 32, 32);
std::cout << "benchmarking " << 8 * sizeof(T)
<< "bit weight (binary activations)\n";
auto ref = ref_layer<T, int32_t, int32_t>(123, W);
std::cout << " ref: " << std::bitset<32>(ref) << "\n";
auto l = layer<T>(123, psW.data());
std::cout << " layer: " << std::bitset<32>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T, int32_t, int32_t>(test, W);
auto l = layer<T>(test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
auto X = 0xff1fff17;
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto iters = 10000000;
std::cout << " 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X >> 1;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X >> 1;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto iters = 10000000;
std::cout << " 2 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X ^= layer<T>(X, psW1.data());
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X;
X ^= layer<T>(X, psW1.data());
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto W2 = rand_mat<T>(32, 32);
auto psW2 = to_cache(W2, 32, 32);
auto iters = 10000000;
std::cout << " 3 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X ^= layer<T>(X, psW1.data());
X ^= layer<T>(X, psW2.data());
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X;
X ^= layer<T>(X, psW1.data());
X = ~X;
X ^= layer<T>(X, psW2.data());
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 * 3 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto iters = 10000000;
auto bs = 4;
int32_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = perm32(i);
}
std::cout << " " << bs << " batched 2 layer bs neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops : ";
std::cout << diff.count() * 1e9 / iters << "ns/iter) [" << Xs[0] << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto cache_killer = 10000;
std::vector<std::vector<T>> ws;
for (auto i = 0; i < cache_killer; ++i) {
ws.emplace_back(to_cache(W0, 32, 32));
}
auto iters = 10000000;
std::cout << " cache-killed 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
auto &w = ws[i % cache_killer];
X ^= layer<T>(X, w.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
auto &w = ws[i % cache_killer];
X ^= layer<T>(X, w.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
}
template <typename T> void bench_64_32() {
std::cout << "benchmarking " << 8 * sizeof(T)
<< "bit weight 64->32 (binary activations)\n";
auto W = rand_mat<T>(64, 32);
auto psW = to_cache(W, 64, 32);
auto ref = ref_layer<T>(123, 123, W);
std::cout << " ref: " << std::bitset<32>(ref) << "\n";
auto l = layer_64_32(123, 123, psW.data());
std::cout << " layer: " << std::bitset<32>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T>(test, ~test, W);
auto l = layer_64_32(test, ~test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
auto X = 0xff1fff17;
{
auto W0 = rand_mat<T>(64, 32);
auto psW0 = to_cache(W0, 64, 32);
auto iters = 10000000;
std::cout << " 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer_64_32(X, ~X, psW0.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer_64_32(X, ~X, psW0.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(64, 32);
auto psW0 = to_cache(W0, 64, 32);
auto iters = 10000000;
auto bs = 8;
int32_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = perm32(i);
}
std::cout << " " << bs << " batched 1 layer 32 neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer_64_32(Xs, bs, psW0.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer_64_32(Xs, bs, psW0.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
}
template <typename T> void bench_64() {
auto W = rand_mat<T>(64, 64);
auto psW = to_cache(W, 64, 64);
std::cout << "benchmarking 64->64 " << 8 * sizeof(T)
<< "bit weight (binary activations)\n";
auto ref = ref_layer<T, int64_t, int64_t>(123, W);
std::cout << " ref: " << std::bitset<64>(ref) << "\n";
auto l = layer_64<T>(123, psW.data());
std::cout << " layer: " << std::bitset<64>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T, int64_t, int64_t>(test, W);
auto l = layer_64<T>(test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
int64_t X = 0xff1fff17ffff883f;
{
auto W0 = rand_mat<T>(64, 64);
auto psW0 = to_cache(W0, 64, 64);
auto iters = 10000000;
std::cout << " 1 layer 64 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer_64<T>(X, psW0.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer_64<T>(X, psW0.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(64, 64);
auto psW0 = to_cache(W0, 64, 64);
auto iters = 10000000;
auto bs = 8;
int64_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = (uint64_t)perm32(i) << 32 | perm32(~i);
}
std::cout << " " << bs << " batched 1 layer 64 neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = (((uint64_t)Xs[i] << 1) | ((uint64_t)Xs[i] >> (64 - 2))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
}
template <typename T>
void bench_2bit() {
auto W = rand_mat<T>(32, 32);
auto psW = to_cache(W, 32, 32);
auto X = rand_mat<T>(1, 32);
for (auto& x : X) {
x &= 0b11;
}
int32_t Xb0 = 0;
int32_t Xb1 = 0;
for (auto i = 0; i < 32; ++i) {
Xb0 |= (X[i] & 0b1) << i;
Xb1 |= ((X[i] & 0b10) >> 1) << i;
}
auto o = layer_2bit(Xb0, Xb1, psW.data());
std::cout << "alg ";
std::cout << std::bitset<32>(o) << "\n";
std::vector<T> out(32);
ref_layer_full(X.data(), W.data(), 1, 32, 32, out.data());
std::cout << "ref ";
int32_t ref = 0;
for (auto i = 0; i < out.size(); ++i) {
ref |= (out[i] > 0) << i;
}
std::cout << std::bitset<32>(ref) << "\n";
int32_t Xs[2];
Xs[0] = perm32(0 + 1337);
Xs[1] = perm32(1 + 1337);
auto iters = 100000000;
for (size_t i = 0; i < 10000; ++i) {
auto o = layer_2bit(Xs[0], Xs[1], psW.data());
Xs[0] ^= o;
Xs[0] = ~Xs[0] << 1;
Xs[1] ^= o;
Xs[1] = ~Xs[1] >> 1;
//std::cout << std::bitset<32>(Xs[0]) << ":" << std::bitset<32>(Xs[1]) << "\n";
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
auto o = layer_2bit(Xs[0], Xs[1], psW.data());
Xs[0] ^= o;
Xs[0] = ~Xs[0] << 1;
Xs[1] ^= o;
Xs[1] = ~Xs[1] >> 1;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
int main(int argc, char **argv) {
#if VEC_WIDTH == 256
bench<int8_t>();
#endif
bench<int16_t>();
#if VEC_WIDTH == 512
bench<float>();
bench_64_32<int16_t>();
bench_64<int16_t>();
bench_64<int8_t>();
#endif
bench_2bit<int16_t>();
return 0;
}