@bwasti
Last active October 19, 2021 02:21
binary activations for fast inference on intel devices
/*
AVX2:
  g++ dot_cache.cc -march=native -O3
AVX512:
  g++ -DVEC_WIDTH=512 dot_cache.cc -march=native -O3
2x larger cache (P=8 is the sweet spot):
  g++ -DP=16 dot_cache.cc -march=native -O3

The premise is that we want to multiply binary activations
by high-precision weights.

The trick is that we can precompute the partial dot products
(P sets how many activation bits each precomputed entry covers,
and therefore how big the precompute cache is) and use P-bit
chunks of the binary activations as indices into that cache
(reinterpreting the bits as integers). A scalar sketch of the
idea follows this comment.

Results on AVX2, int16 weights, binary activations, batch=8,
2 layer 32 neuron MLP + relu: ~71 ns per batch, 8.89287 ns/infer,
460.594 Gops.

Extensions/TODOs:
- We can increase the activation precision by using multiple
  horizontal layers and computing the bits individually
  (e.g. CSA logic).
- Need to test the limit of the precompute size.
- Need to test AVX512.
- Need to test prefetching.
*/
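// A minimal scalar sketch of the precompute trick described above (illustration
// only, not used by the code below; `table` and `p` are hypothetical names):
// for one output neuron, table[c * (1 << p) + b] holds the pre-summed weights of
// p-bit chunk c for bit pattern b, so a K-element binary dot product becomes
// K / p table lookups instead of K multiply-adds.
static inline int scalar_binary_dot(unsigned int a, const int *table, int K, int p) {
  int acc = 0;
  for (int c = 0; c < K / p; ++c) {
    int b = (a >> (c * p)) & ((1 << p) - 1); // p activation bits reused as a table index
    acc += table[c * (1 << p) + b];          // precomputed partial dot product
  }
  return acc;
}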
#include <bitset>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>
#define NONE 0
#define INTEL 1
#define ARM 2
#ifndef ISA
#define ISA INTEL
#endif
#if ISA == INTEL
#include <immintrin.h>
#else
#endif
#ifndef P
#define P 8
#endif
#ifndef VEC_WIDTH
#if ISA == INTEL
#define VEC_WIDTH 256
#else
#define VEC_WIDTH 128
#endif
#endif
#define VEC_BYTES (VEC_WIDTH / 8)
// one fully-connected layer: binary activation word in, binary output word out,
// using the pre-summed weight cache built by to_cache() below
template <typename T> int32_t layer(int32_t a, T *weights);
template <typename T> int64_t layer_64(int64_t a, T *weights);
// 2-bit activations, split into bit planes: element k has value
// a_k = (((a_1 >> k) & 1) << 1) | ((a_0 >> k) & 1), i.e. a_1 holds the high
// bits and a_0 the low bits
int32_t layer_2bit(int32_t a_0, int32_t a_1, int16_t *weights) {
auto v0 = _mm256_set1_epi16(0);
auto v1 = _mm256_set1_epi16(0);
auto zero = _mm256_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
// low-bit plane (a_0): accumulate the precomputed partial sums directly
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a_0 >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
// high-bit plane (a_1): these bits carry weight 2, so the partial sums are
// shifted left by 1 before accumulating
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a_1 >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0_ = _mm256_slli_epi16(v0_, 1);
v1_ = _mm256_slli_epi16(v1_, 1);
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
// binarize: output bit n is 1 iff neuron n's sum is > 0; the byte blend
// interleaves the two accumulators (matching the permuted layout written by
// to_cache) so a single movemask packs all 32 output bits
v0 = _mm256_cmpgt_epi16(v0, zero);
v1 = _mm256_cmpgt_epi16(v1, zero);
auto outv = _mm256_blendv_epi8(v0, v1, mask);
return _mm256_movemask_epi8(outv);
}
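// Scalar reference for the 2-bit case (illustration only, not used below): each
// activation a_k in {0,1,2,3} splits into bit planes a_1 (high) and a_0 (low),
// so the dot product is the a_0 lookup sum plus twice the a_1 lookup sum --
// which is why the second loop above shifts its partial sums left by one.
static inline int scalar_dot_2bit(uint32_t a_0, uint32_t a_1, const int16_t *w /* 32 weights */) {
  int acc = 0;
  for (int k = 0; k < 32; ++k) {
    int a_k = (((a_1 >> k) & 1) << 1) | ((a_0 >> k) & 1); // reassemble the 2-bit value
    acc += a_k * w[k];
  }
  return acc;
}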
#if ISA == INTEL
#if VEC_WIDTH == 256
template <> int32_t layer(int32_t a, int8_t *weights) {
auto psN = 1;
auto psP = 1 << P;
auto vec_width = VEC_BYTES / sizeof(int8_t);
auto v = _mm256_set1_epi8(0);
auto zero = _mm256_set1_epi8(0);
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + b));
__m256i v_ = _mm256_loadu_si256((__m256i *)&(weights[idx]));
v = _mm256_add_epi8(v_, v);
}
v = _mm256_cmpgt_epi8(v, zero);
return _mm256_movemask_epi8(v);
}
template <> int32_t layer(int32_t a, int16_t *weights) {
auto v0 = _mm256_set1_epi16(0);
auto v1 = _mm256_set1_epi16(0);
auto zero = _mm256_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
__m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
v0 = _mm256_add_epi16(v0_, v0);
v1 = _mm256_add_epi16(v1, v1_);
}
auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
v0 = _mm256_cmpgt_epi16(v0, zero);
v1 = _mm256_cmpgt_epi16(v1, zero);
auto outv = _mm256_blendv_epi8(v0, v1, mask);
return _mm256_movemask_epi8(outv);
}
#else // VEC_WIDTH == 512
template <> int32_t layer(int32_t a, int16_t *weights) {
auto v0 = _mm512_set1_epi16(0);
auto v1 = _mm512_set1_epi16(0);
auto zero = _mm512_set1_epi16(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
__m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
v0 = _mm512_add_epi16(v0_, v0);
}
auto out_mask = _mm512_cmpgt_epi16_mask(v0, zero);
return _cvtmask32_u32(out_mask);
}
template <> int32_t layer(int32_t a, float *weights) {
auto v0 = _mm512_set1_ps(0);
auto v1 = _mm512_set1_ps(0);
auto zero = _mm512_set1_ps(0);
auto N = 32;
auto vec_width = VEC_BYTES / sizeof(float);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
auto v0_ = _mm512_loadu_ps((__m512 *)&(weights[idx0]));
auto v1_ = _mm512_loadu_ps((__m512 *)&(weights[idx1]));
v0 = _mm512_add_ps(v0_, v0);
v1 = _mm512_add_ps(v1_, v1);
}
auto v0_m = _mm512_cmp_ps_mask(v0, zero, _CMP_GT_OQ);
auto v1_m = _mm512_cmp_ps_mask(v1, zero, _CMP_GT_OQ);
return _cvtmask16_u32(v0_m) | _cvtmask16_u32(v1_m) << 16;
}
template <> int64_t layer_64(int64_t a, int16_t *weights) {
auto v0 = _mm512_set1_epi16(0);
auto v1 = _mm512_set1_epi16(0);
auto zero = _mm512_set1_epi16(0);
auto N = 64;
auto K = 64;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < K; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
__m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
__m512i v1_ = _mm512_loadu_si512((__m512i *)&(weights[idx1]));
v0 = _mm512_add_epi16(v0_, v0);
v1 = _mm512_add_epi16(v1, v1_);
}
auto v0_m = _mm512_cmpgt_epi16_mask(v0, zero);
auto v1_m = _mm512_cmpgt_epi16_mask(v1, zero);
return (int64_t)_cvtmask32_u32(v0_m) | (int64_t)_cvtmask32_u32(v1_m) << 32;
}
template <> int64_t layer_64(int64_t a, int8_t *weights) {
auto v = _mm512_set1_epi8(0);
auto zero = _mm512_set1_epi8(0);
auto N = 64;
auto K = 64;
auto vec_width = VEC_BYTES / sizeof(int8_t);
auto psN = N / vec_width;
auto psP = 1 << P;
#pragma unroll
for (auto i = 0; i < K; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + b));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi8(v_, v);
}
auto v_m = _mm512_cmpgt_epi8_mask(v, zero);
return _cvtmask64_u64(v_m);
}
// 64 binary inputs (passed as two 32-bit words) -> 32 binary outputs
int32_t layer_64_32(int32_t a, int32_t b, int16_t *weights) {
auto zero = _mm512_set1_epi16(0);
auto N = 32;
auto K = 32;
auto vec_width = VEC_BYTES / sizeof(int16_t);
auto psN = 1;
auto psP = 1 << P;
auto v = _mm512_set1_epi16(0);
// first a
for (auto i = 0; i < K; i += P) {
auto bin = (a >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * (i / P) + bin));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi16(v_, v);
}
// then b
for (auto i = 0; i < K; i += P) {
auto bin = (b >> i) & ((1 << P) - 1);
auto idx = vec_width * (psN * (psP * ((i + K) / P) + bin));
__m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
v = _mm512_add_epi16(v_, v);
}
auto out_mask = _mm512_cmpgt_epi16_mask(v, zero);
return _cvtmask32_u32(out_mask);
}
void batch_layer_64_32(int32_t *inps, int N, int16_t *weights, int32_t *outs) {
for (auto i = 0; i < N; ++i) {
outs[i] = layer_64_32(inps[i], ~inps[i], weights);
}
}
#endif // VEC_WIDTH
#else // ISA != INTEL
template <typename T> int32_t layer(int32_t a, T *weights) {
auto psN = 1;
auto psP = 1 << P;
T v[32] = { 0 };
for (auto i = 0; i < 32; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = 32 * (psN * (psP * (i / P) + b));
for (auto k = 0; k < 32; ++k) {
v[k] += weights[idx + k];
}
}
uint32_t out = 0;
for (auto k = 0; k < 32; ++k) {
out |= v[k] > 0 ? (1u << k) : 0;
}
return out;
}
template <typename T> int64_t layer_64(int64_t a, T *weights) {
auto psN = 1;
auto psP = 1 << P;
T v[64] = { 0 };
for (auto i = 0; i < 64; i += P) {
auto b = (a >> i) & ((1 << P) - 1);
auto idx = 64 * (psN * (psP * (i / P) + b));
for (auto k = 0; k < 64; ++k) {
v[k] += weights[idx + k];
}
}
uint64_t out = 0;
for (auto k = 0; k < 64; ++k) {
out |= v[k] > 0 ? ((uint64_t)1 << k) : 0;
}
return out;
}
#endif
// apply `layer` (32-bit activations) or `layer_64` (64-bit activations) across a batch
template <typename T, typename BT>
void batch_layer(BT *inps, int N, T *weights, BT *outs) {
if (sizeof(BT) == 4) {
for (auto i = 0; i < N; ++i) {
outs[i] = layer<T>(inps[i], weights);
}
} else {
for (auto i = 0; i < N; ++i) {
outs[i] = layer_64(inps[i], weights);
}
}
}
// convert a K x N weight matrix into a (K / P) x 2^P x N pre-summed, indexable cache
template <typename T>
std::vector<T> to_cache(std::vector<T> orig, int K, int N) {
auto vec_width = VEC_BYTES / sizeof(T);
assert(N % vec_width == 0);
assert(K % P == 0);
auto psN = N / vec_width;
auto psP = 1 << P;
auto psK = K / P;
std::vector<T> out(N * psK * psP);
for (auto n = 0; n < N; ++n) {
auto n_ = n;
// permute neuron order so the int16 AVX2 path can merge its two accumulators
// with a single byte blend + movemask after binarization
if (sizeof(T) == 2 && VEC_WIDTH == 256) {
n_ = (n % psN) * vec_width + n / psN;
}
for (auto o_k = 0; o_k < psK; ++o_k) {
for (auto b = 0; b < psP; ++b) {
T tot = 0;
for (auto i_k = 0; i_k < P; ++i_k) {
auto k = o_k * P + i_k;
if ((b >> i_k) & 1) {
tot += orig[k * N + n];
}
}
auto idx = psN * vec_width * (psP * (o_k) + b) + n_;
out[idx] = tot;
}
}
}
return out;
}
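// Scalar illustration of the cache construction above (illustration only, not
// used below; single-neuron layout, no vector permutation): entry (c, b) is the
// sum of the weights w[c * p + j] over the set bits j of the pattern b.
template <typename T>
std::vector<T> scalar_to_cache(const std::vector<T> &w /* K weights, one neuron */,
                               int K, int p) {
  std::vector<T> table((K / p) * (1 << p));
  for (int c = 0; c < K / p; ++c) {
    for (int b = 0; b < (1 << p); ++b) {
      T tot = 0;
      for (int j = 0; j < p; ++j) {
        if ((b >> j) & 1) {
          tot += w[c * p + j];
        }
      }
      table[c * (1 << p) + b] = tot;
    }
  }
  return table;
}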
template <typename T> std::vector<T> rand_mat(int K, int N) {
std::vector<T> out(N * K);
for (auto k = 0; k < K; ++k) {
for (auto n = 0; n < N; ++n) {
auto i = rand() % 10 - 5;
if (sizeof(T) > 1) {
i = rand() % 100 - 50;
}
out[k * N + n] = i;
}
}
return out;
}
template <typename T, typename IN, typename OUT>
OUT ref_layer(IN inp, std::vector<T> W) {
OUT out = 0;
// std::cout << "ref ";
for (auto n = 0; n < sizeof(OUT) * 8; ++n) {
T tot = 0;
for (auto k = 0; k < sizeof(IN) * 8; ++k) {
if ((inp >> k) & 1) {
tot += W[k * sizeof(OUT) * 8 + n];
}
}
// std::cout << tot << " ";
if (tot > 0) {
out |= ((uint64_t)1 << n);
}
}
// std::cout << "\n";
return out;
}
template <typename T>
void ref_layer_full(T* inp, T* W, int M, int N, int K, T* out) {
//std::cout << "ref ";
for (auto n = 0; n < N; ++n) {
for (auto m = 0; m < M; ++m) {
T tot = 0;
for (auto k = 0; k < K; ++k) {
tot += inp[m * K + k] * W[k * N + n];
}
out[m * N + n] = tot;
//std::cout << tot << " ";
}
}
//std::cout << "\n";
}
template <typename T>
int32_t ref_layer(int32_t inp, int32_t inp2, std::vector<T> W) {
int32_t out = 0;
for (auto n = 0; n < 32; ++n) {
T tot = 0;
for (auto k = 0; k < 32; ++k) {
if ((inp >> k) & 1) {
tot += W[k * 32 + n];
}
}
for (auto k = 32; k < 64; ++k) {
if ((inp2 >> (k - 32)) & 1) {
tot += W[k * 32 + n];
}
}
if (tot > 0) {
out |= (1 << n);
}
}
return out;
}
// https://crypto.stackexchange.com/questions/16219/cryptographic-hash-function-for-32-bit-length-input-keys
uint32_t perm32(uint32_t x) {
int n = 12;
do {
x = ((x >> 8) ^ x) * 0x6B + n;
} while (--n != 0);
return x;
}
template <typename T> void bench() {
auto W = rand_mat<T>(32, 32);
auto psW = to_cache(W, 32, 32);
std::cout << "benchmarking " << 8 * sizeof(T)
<< "bit weight (binary activations)\n";
auto ref = ref_layer<T, int32_t, int32_t>(123, W);
std::cout << " ref: " << std::bitset<32>(ref) << "\n";
auto l = layer<T>(123, psW.data());
std::cout << " layer: " << std::bitset<32>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T, int32_t, int32_t>(test, W);
auto l = layer<T>(test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
auto X = 0xff1fff17;
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto iters = 10000000;
std::cout << " 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X >> 1;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X >> 1;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto iters = 10000000;
std::cout << " 2 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X ^= layer<T>(X, psW1.data());
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X;
X ^= layer<T>(X, psW1.data());
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto W2 = rand_mat<T>(32, 32);
auto psW2 = to_cache(W2, 32, 32);
auto iters = 10000000;
std::cout << " 3 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer<T>(X, psW0.data());
X ^= layer<T>(X, psW1.data());
X ^= layer<T>(X, psW2.data());
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer<T>(X, psW0.data());
X = ~X;
X ^= layer<T>(X, psW1.data());
X = ~X;
X ^= layer<T>(X, psW2.data());
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 * 3 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto psW0 = to_cache(W0, 32, 32);
auto W1 = rand_mat<T>(32, 32);
auto psW1 = to_cache(W1, 32, 32);
auto iters = 10000000;
auto bs = 4;
int32_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = perm32(i);
}
std::cout << " " << bs << " batched 2 layer bs neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops : ";
std::cout << diff.count() * 1e9 / iters << "ns/iter) [" << Xs[0] << "]\n";
}
{
auto W0 = rand_mat<T>(32, 32);
auto cache_killer = 10000;
std::vector<std::vector<T>> ws;
for (auto i = 0; i < cache_killer; ++i) {
ws.emplace_back(to_cache(W0, 32, 32));
}
auto iters = 10000000;
std::cout << " cache-killed 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
auto &w = ws[i % cache_killer];
X ^= layer<T>(X, w.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
auto &w = ws[i % cache_killer];
X ^= layer<T>(X, w.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
}
template <typename T> void bench_64_32() {
std::cout << "benchmarking " << 8 * sizeof(T)
<< "bit weight 64->32 (binary activations)\n";
auto W = rand_mat<T>(64, 32);
auto psW = to_cache(W, 64, 32);
auto ref = ref_layer<T>(123, 123, W);
std::cout << " ref: " << std::bitset<32>(ref) << "\n";
auto l = layer_64_32(123, 123, psW.data());
std::cout << " layer: " << std::bitset<32>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T>(test, ~test, W);
auto l = layer_64_32(test, ~test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
auto X = 0xff1fff17;
{
auto W0 = rand_mat<T>(64, 32);
auto psW0 = to_cache(W0, 64, 32);
auto iters = 10000000;
std::cout << " 1 layer 32 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer_64_32(X, ~X, psW0.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer_64_32(X, ~X, psW0.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(64, 32);
auto psW0 = to_cache(W0, 64, 32);
auto iters = 10000000;
auto bs = 8;
int32_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = perm32(i);
}
std::cout << " " << bs << " batched 1 layer 32 neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer_64_32(Xs, bs, psW0.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer_64_32(Xs, bs, psW0.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
}
template <typename T> void bench_64() {
auto W = rand_mat<T>(64, 64);
auto psW = to_cache(W, 64, 64);
std::cout << "benchmarking 64->64 " << 8 * sizeof(T)
<< "bit weight (binary activations)\n";
auto ref = ref_layer<T, int64_t, int64_t>(123, W);
std::cout << " ref: " << std::bitset<64>(ref) << "\n";
auto l = layer_64<T>(123, psW.data());
std::cout << " layer: " << std::bitset<64>(l) << "\n";
assert(ref == l);
for (auto i = 1; i < (1 << 30); i *= 2) {
auto test = i + 1337;
auto ref = ref_layer<T, int64_t, int64_t>(test, W);
auto l = layer_64<T>(test, psW.data());
if (ref != l) {
std::cerr << "ERROR! input " << test << " breaks test\n";
return;
}
}
int64_t X = 0xff1fff17ffff883f;
{
auto W0 = rand_mat<T>(64, 64);
auto psW0 = to_cache(W0, 64, 64);
auto iters = 10000000;
std::cout << " 1 layer 64 neuron\n";
for (size_t i = 0; i < 1000; ++i) {
X ^= layer_64<T>(X, psW0.data());
X = ~X;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
X ^= layer_64<T>(X, psW0.data());
X = ~X;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << X << "]\n";
}
{
auto W0 = rand_mat<T>(64, 64);
auto psW0 = to_cache(W0, 64, 64);
auto iters = 10000000;
auto bs = 8;
int64_t Xs[bs];
for (auto i = 0; i < bs; ++i) {
Xs[i] = (uint64_t)perm32(i) << 32 | perm32(~i);
}
std::cout << " " << bs << " batched 1 layer 64 neuron\n";
auto rot = [&]() {
for (auto i = 0; i < bs; ++i) {
// rotate and flip bottom bits
auto j = (i + 1) % bs;
Xs[j] = (((uint64_t)Xs[i] << 1) | ((uint64_t)Xs[i] >> (64 - 2))) ^ X;
X ^= Xs[j];
}
};
for (size_t i = 0; i < 1000; ++i) {
rot();
batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
rot();
batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = bs * 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
std::cout << " infers/s: " << (iters * bs) / diff.count();
std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
}
template <typename T>
void bench_2bit() {
auto W = rand_mat<T>(32, 32);
auto psW = to_cache(W, 32, 32);
auto X = rand_mat<T>(1, 32);
for (auto& x : X) {
x &= 0b11;
}
int32_t Xb0 = 0;
int32_t Xb1 = 0;
for (auto i = 0; i < 32; ++i) {
Xb0 |= (X[i] & 0b1) << i;
Xb1 |= ((X[i] & 0b10) >> 1) << i;
}
auto o = layer_2bit(Xb0, Xb1, psW.data());
std::cout << "alg ";
std::cout << std::bitset<32>(o) << "\n";
std::vector<T> out(32);
ref_layer_full(X.data(), W.data(), 1, 32, 32, out.data());
std::cout << "ref ";
int32_t ref = 0;
for (auto i = 0; i < out.size(); ++i) {
ref |= (out[i] > 0) << i;
}
std::cout << std::bitset<32>(ref) << "\n";
int32_t Xs[2];
Xs[0] = perm32(0 + 1337);
Xs[1] = perm32(1 + 1337);
auto iters = 100000000;
for (size_t i = 0; i < 10000; ++i) {
auto o = layer_2bit(Xs[0], Xs[1], psW.data());
Xs[0] ^= o;
Xs[0] = ~Xs[0] << 1;
Xs[1] ^= o;
Xs[1] = ~Xs[1] >> 1;
//std::cout << std::bitset<32>(Xs[0]) << ":" << std::bitset<32>(Xs[1]) << "\n";
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iters; ++i) {
auto o = layer_2bit(Xs[0], Xs[1], psW.data());
Xs[0] ^= o;
Xs[0] = ~Xs[0] << 1;
Xs[1] ^= o;
Xs[1] = ~Xs[1] >> 1;
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
std::cout << " iters/s: " << iters / diff.count();
std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
int main(int argc, char **argv) {
#if VEC_WIDTH == 256
bench<int8_t>();
#endif
bench<int16_t>();
#if VEC_WIDTH == 512
bench<float>();
bench_64_32<int16_t>();
bench_64<int16_t>();
bench_64<int8_t>();
#endif
bench_2bit<int16_t>();
return 0;
}