Binary activations for fast inference on Intel devices
/*
AVX2:
  g++ dot_cache.cc -march=native -O3
AVX512:
  g++ -DVEC_WIDTH=512 dot_cache.cc -march=native -O3
Larger precompute table (P=8 is the sweet spot):
  g++ -DP=16 dot_cache.cc -march=native -O3

The premise is that we want to multiply binary activations
by high-precision weights.

The trick is that we can precompute the partial dot products
(the value P sets the chunk size of the precompute) and use P-bit chunks
of the binary activations as indices (reinterpreting the bits as integers).
For a K x N layer the table holds (K / P) * 2^P presummed rows of N
weights, so P trades table size against lookups per dot product; see the
scalar sketch after the preprocessor setup below.

Results on AVX2, int16 weights, binary activations, batch=8, 2-layer
32-neuron MLP + ReLU: ~71 ns per batch, 8.89287 ns/infer, 460.594 Gops.

Extensions/TODOs:
- We can increase the activation precision by using multiple horizontal
  layers and computing the bits individually (e.g. CSA logic).
- Need to test the limit of the precompute size.
- Need to test AVX512.
- Need to test prefetching.
*/
#include <bitset>
#include <cassert>
#include <chrono>
#include <cstdint> // fixed-width integer types used throughout
#include <cstdlib> // rand()
#include <iostream>
#include <vector>

#define NONE 0
#define INTEL 1
#define ARM 2

#ifndef ISA
#define ISA INTEL
#endif

#if ISA == INTEL
#include <immintrin.h>
#endif

#ifndef P
#define P 8
#endif

#ifndef VEC_WIDTH
#if ISA == INTEL
#define VEC_WIDTH 256
#else
#define VEC_WIDTH 128
#endif
#endif

#define VEC_BYTES (VEC_WIDTH / 8)
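
// A minimal scalar sketch of the trick for a single output neuron
// (illustrative only; dot_sketch is a hypothetical helper not used by the
// kernels below, which vectorize this across N neurons). Each P-bit chunk
// of the binary activation is reinterpreted as an integer and used to index
// a table of 2^P presummed weight rows.
static inline int32_t dot_sketch(uint32_t act, const int16_t *table) {
  // assumed table layout (per neuron): (32 / P) chunks x 2^P entries, where
  // table[chunk * (1 << P) + b] = sum of the weights whose bit is set in b
  int32_t acc = 0;
  for (int i = 0; i < 32; i += P) {
    int b = (act >> i) & ((1 << P) - 1); // this chunk's bits as an index
    acc += table[(i / P) * (1 << P) + b]; // presummed partial dot product
  }
  return acc;
}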
template <typename T> int32_t layer(int32_t a, T *weights);
template <typename T> int64_t layer_64(int64_t a, T *weights);

// 2-bit activations are carried as two bit planes: activation k has the
// value 2 * ((a_1 >> k) & 1) + ((a_0 >> k) & 1).
int32_t layer_2bit(int32_t a_0, int32_t a_1, int16_t *weights) {
  // note: assumes a VEC_WIDTH == 256 build (AVX2 loads and to_cache's
  // interleaved int16 layout)
  auto v0 = _mm256_set1_epi16(0);
  auto v1 = _mm256_set1_epi16(0);
  auto zero = _mm256_set1_epi16(0);
  auto N = 32;
  auto vec_width = VEC_BYTES / sizeof(int16_t);
  auto psN = N / vec_width;
  auto psP = 1 << P;
  // low-bit plane: accumulate the presummed rows as-is
#pragma unroll
  for (auto i = 0; i < 32; i += P) {
    auto b = (a_0 >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
    __m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
    __m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
    v0 = _mm256_add_epi16(v0_, v0);
    v1 = _mm256_add_epi16(v1, v1_);
  }
  // high-bit plane: same lookups, doubled via a left shift before adding
#pragma unroll
  for (auto i = 0; i < 32; i += P) {
    auto b = (a_1 >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
    __m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
    __m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
    v0_ = _mm256_slli_epi16(v0_, 1);
    v1_ = _mm256_slli_epi16(v1_, 1);
    v0 = _mm256_add_epi16(v0_, v0);
    v1 = _mm256_add_epi16(v1, v1_);
  }
  // threshold, then interleave the two vectors' sign bytes into one mask
  auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                              0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                              0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
  v0 = _mm256_cmpgt_epi16(v0, zero);
  v1 = _mm256_cmpgt_epi16(v1, zero);
  auto outv = _mm256_blendv_epi8(v0, v1, mask);
  return _mm256_movemask_epi8(outv);
}
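
// Sketch of the bit-plane packing layer_2bit expects (a hypothetical helper
// mirroring the packing done inline in bench_2bit below; not used elsewhere).
// plane0 carries the low bit and plane1 the high bit of each 2-bit
// activation, so layer_2bit computes sign(plane0 . W + 2 * (plane1 . W)).
static inline void pack_2bit_planes(const int16_t *x, // 32 values in [0, 3]
                                    int32_t &plane0, int32_t &plane1) {
  uint32_t p0 = 0, p1 = 0;
  for (int i = 0; i < 32; ++i) {
    p0 |= uint32_t(x[i] & 0b01) << i;        // low bit of activation i
    p1 |= uint32_t((x[i] & 0b10) >> 1) << i; // high bit of activation i
  }
  plane0 = (int32_t)p0;
  plane1 = (int32_t)p1;
}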
#if ISA == INTEL
#if VEC_WIDTH == 256
template <> int32_t layer(int32_t a, int8_t *weights) {
  auto psN = 1;
  auto psP = 1 << P;
  auto vec_width = VEC_BYTES / sizeof(int8_t);
  auto v = _mm256_set1_epi8(0);
  auto zero = _mm256_set1_epi8(0);
  for (auto i = 0; i < 32; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx = vec_width * (psN * (psP * (i / P) + b));
    __m256i v_ = _mm256_loadu_si256((__m256i *)&(weights[idx]));
    v = _mm256_add_epi8(v_, v);
  }
  v = _mm256_cmpgt_epi8(v, zero);
  return _mm256_movemask_epi8(v);
}
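
// With int16 weights, the 32 accumulators span two 256-bit vectors. to_cache
// stores even-numbered neurons in the first vector and odd-numbered neurons
// in the second (the n_ permutation there), so after the sign compares the
// byte blend interleaves the two vectors and movemask emits output bit n for
// neuron n in order.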
template <> int32_t layer(int32_t a, int16_t *weights) {
  auto v0 = _mm256_set1_epi16(0);
  auto v1 = _mm256_set1_epi16(0);
  auto zero = _mm256_set1_epi16(0);
  auto N = 32;
  auto vec_width = VEC_BYTES / sizeof(int16_t);
  auto psN = N / vec_width;
  auto psP = 1 << P;
#pragma unroll
  for (auto i = 0; i < 32; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
    __m256i v0_ = _mm256_loadu_si256((__m256i *)&(weights[idx0]));
    __m256i v1_ = _mm256_loadu_si256((__m256i *)&(weights[idx1]));
    v0 = _mm256_add_epi16(v0_, v0);
    v1 = _mm256_add_epi16(v1, v1_);
  }
  auto mask = _mm256_set_epi8(0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                              0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                              0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0);
  v0 = _mm256_cmpgt_epi16(v0, zero);
  v1 = _mm256_cmpgt_epi16(v1, zero);
  auto outv = _mm256_blendv_epi8(v0, v1, mask);
  return _mm256_movemask_epi8(outv);
}
#else // VEC_WIDTH == 512
template <> int32_t layer(int32_t a, int16_t *weights) {
  // all 32 int16 accumulators fit in a single 512-bit vector
  auto v0 = _mm512_set1_epi16(0);
  auto zero = _mm512_set1_epi16(0);
  auto N = 32;
  auto vec_width = VEC_BYTES / sizeof(int16_t);
  auto psN = N / vec_width;
  auto psP = 1 << P;
#pragma unroll
  for (auto i = 0; i < 32; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    __m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
    v0 = _mm512_add_epi16(v0_, v0);
  }
  auto out_mask = _mm512_cmpgt_epi16_mask(v0, zero);
  return _cvtmask32_u32(out_mask);
}
template <> int32_t layer(int32_t a, float *weights) {
  auto v0 = _mm512_set1_ps(0);
  auto v1 = _mm512_set1_ps(0);
  auto zero = _mm512_set1_ps(0);
  auto N = 32;
  auto vec_width = VEC_BYTES / sizeof(float);
  auto psN = N / vec_width;
  auto psP = 1 << P;
#pragma unroll
  for (auto i = 0; i < 32; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
    auto v0_ = _mm512_loadu_ps((__m512 *)&(weights[idx0]));
    auto v1_ = _mm512_loadu_ps((__m512 *)&(weights[idx1]));
    v0 = _mm512_add_ps(v0_, v0);
    v1 = _mm512_add_ps(v1_, v1);
  }
  auto v0_m = _mm512_cmp_ps_mask(v0, zero, _CMP_GT_OQ);
  auto v1_m = _mm512_cmp_ps_mask(v1, zero, _CMP_GT_OQ);
  return _cvtmask16_u32(v0_m) | _cvtmask16_u32(v1_m) << 16;
}
template <> int64_t layer_64(int64_t a, int16_t *weights) {
  auto v0 = _mm512_set1_epi16(0);
  auto v1 = _mm512_set1_epi16(0);
  auto zero = _mm512_set1_epi16(0);
  auto N = 64;
  auto K = 64;
  auto vec_width = VEC_BYTES / sizeof(int16_t);
  auto psN = N / vec_width;
  auto psP = 1 << P;
#pragma unroll
  for (auto i = 0; i < K; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx0 = vec_width * (psN * (psP * (i / P) + b) + 0);
    auto idx1 = vec_width * (psN * (psP * (i / P) + b) + 1);
    __m512i v0_ = _mm512_loadu_si512((__m512i *)&(weights[idx0]));
    __m512i v1_ = _mm512_loadu_si512((__m512i *)&(weights[idx1]));
    v0 = _mm512_add_epi16(v0_, v0);
    v1 = _mm512_add_epi16(v1, v1_);
  }
  auto v0_m = _mm512_cmpgt_epi16_mask(v0, zero);
  auto v1_m = _mm512_cmpgt_epi16_mask(v1, zero);
  return (int64_t)_cvtmask32_u32(v0_m) | (int64_t)_cvtmask32_u32(v1_m) << 32;
}
template <> int64_t layer_64(int64_t a, int8_t *weights) {
  auto v = _mm512_set1_epi8(0);
  auto zero = _mm512_set1_epi8(0);
  auto N = 64;
  auto K = 64;
  auto vec_width = VEC_BYTES / sizeof(int8_t);
  auto psN = N / vec_width;
  auto psP = 1 << P;
#pragma unroll
  for (auto i = 0; i < K; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx = vec_width * (psN * (psP * (i / P) + b));
    __m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
    v = _mm512_add_epi8(v_, v);
  }
  auto v_m = _mm512_cmpgt_epi8_mask(v, zero);
  return _cvtmask64_u64(v_m);
}
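
// 64 -> 32 layer with the 64 input bits supplied as two 32-bit halves. The
// precomputed table covers all 64 input bits ((64 / P) chunks), so the
// second half's lookups are offset by (i + K) / P chunks into the table.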
int32_t layer_64_32(int32_t a, int32_t b, int16_t *weights) {
  auto zero = _mm512_set1_epi16(0);
  auto N = 32;
  auto K = 32;
  auto vec_width = VEC_BYTES / sizeof(int16_t);
  auto psN = 1;
  auto psP = 1 << P;
  auto v = _mm512_set1_epi16(0);
  // first a (input bits 0..31)
  for (auto i = 0; i < K; i += P) {
    auto bin = (a >> i) & ((1 << P) - 1);
    auto idx = vec_width * (psN * (psP * (i / P) + bin));
    __m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
    v = _mm512_add_epi16(v_, v);
  }
  // then b (input bits 32..63, hence the (i + K) / P chunk offset)
  for (auto i = 0; i < K; i += P) {
    auto bin = (b >> i) & ((1 << P) - 1);
    auto idx = vec_width * (psN * (psP * ((i + K) / P) + bin));
    __m512i v_ = _mm512_loadu_si512((__m512i *)&(weights[idx]));
    v = _mm512_add_epi16(v_, v);
  }
  auto out_mask = _mm512_cmpgt_epi16_mask(v, zero);
  return _cvtmask32_u32(out_mask);
}

void batch_layer_64_32(int32_t *inps, int N, int16_t *weights, int32_t *outs) {
  for (auto i = 0; i < N; ++i) {
    // benchmark stand-in: the complement supplies the second 32-bit half
    outs[i] = layer_64_32(inps[i], ~inps[i], weights);
  }
}
#endif // VEC_WIDTH
#else  // ISA != INTEL
template <typename T> int32_t layer(int32_t a, T *weights) {
  auto psN = 1;
  auto psP = 1 << P;
  T v[32] = {0};
  for (auto i = 0; i < 32; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx = 32 * (psN * (psP * (i / P) + b));
    for (auto k = 0; k < 32; ++k) {
      v[k] += weights[idx + k];
    }
  }
  uint32_t out = 0;
  for (auto k = 0; k < 32; ++k) {
    out |= v[k] > 0 ? (1u << k) : 0;
  }
  return out;
}

template <typename T> int64_t layer_64(int64_t a, T *weights) {
  auto psN = 1;
  auto psP = 1 << P;
  T v[64] = {0};
  for (auto i = 0; i < 64; i += P) {
    auto b = (a >> i) & ((1 << P) - 1);
    auto idx = 64 * (psN * (psP * (i / P) + b));
    for (auto k = 0; k < 64; ++k) {
      v[k] += weights[idx + k];
    }
  }
  uint64_t out = 0;
  for (auto k = 0; k < 64; ++k) {
    out |= v[k] > 0 ? ((uint64_t)1 << k) : 0;
  }
  return out;
}
#endif
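
// Dispatch on the activation width at compile time. The sizeof(BT) branch
// below is an ordinary if, so both arms are instantiated; the unused arm's
// layer_64 call relies on the optimizer dropping dead code when no
// definition exists for the current VEC_WIDTH. With C++17, `if constexpr`
// would arguably make this dispatch more robust.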
template <typename T, typename BT>
void batch_layer(BT *inps, int N, T *weights, BT *outs) {
  if (sizeof(BT) == 4) {
    for (auto i = 0; i < N; ++i) {
      outs[i] = layer<T>(inps[i], weights);
    }
  } else {
    for (auto i = 0; i < N; ++i) {
      outs[i] = layer_64(inps[i], weights);
    }
  }
}
// convert a K x N weight matrix into a (K / P) x 2^P x N presummed,
// indexable table
template <typename T>
std::vector<T> to_cache(std::vector<T> orig, int K, int N) {
  auto vec_width = VEC_BYTES / sizeof(T);
  assert(N % vec_width == 0);
  assert(K % P == 0);
  auto psN = N / vec_width;
  auto psP = 1 << P;
  auto psK = K / P;
  std::vector<T> out(N * psK * psP);
  for (auto n = 0; n < N; ++n) {
    auto n_ = n;
    // interleave even/odd neurons across the two vectors; this enables the
    // quick blend for merging int16 vectors after the sign threshold
    if (sizeof(T) == 2 && VEC_WIDTH == 256) {
      n_ = (n % psN) * vec_width + n / psN;
    }
    for (auto o_k = 0; o_k < psK; ++o_k) {
      for (auto b = 0; b < psP; ++b) {
        T tot = 0;
        for (auto i_k = 0; i_k < P; ++i_k) {
          auto k = o_k * P + i_k;
          if ((b >> i_k) & 1) {
            tot += orig[k * N + n];
          }
        }
        auto idx = psN * vec_width * (psP * o_k + b) + n_;
        out[idx] = tot;
      }
    }
  }
  return out;
}
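
// Example wiring (this is exactly what the bench* functions below do):
//   auto W = rand_mat<int16_t>(32, 32);             // K x N weights
//   auto psW = to_cache(W, 32, 32);                 // presummed table
//   int32_t out = layer<int16_t>(act, psW.data());  // one binary-MLP layer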
template <typename T> std::vector<T> rand_mat(int K, int N) {
  std::vector<T> out(N * K);
  for (auto k = 0; k < K; ++k) {
    for (auto n = 0; n < N; ++n) {
      auto i = rand() % 10 - 5;
      if (sizeof(T) > 1) {
        i = rand() % 100 - 50;
      }
      out[k * N + n] = i;
    }
  }
  return out;
}
template <typename T, typename IN, typename OUT>
OUT ref_layer(IN inp, std::vector<T> W) {
  OUT out = 0;
  for (auto n = 0; n < sizeof(OUT) * 8; ++n) {
    T tot = 0;
    for (auto k = 0; k < sizeof(IN) * 8; ++k) {
      if ((inp >> k) & 1) {
        tot += W[k * sizeof(OUT) * 8 + n];
      }
    }
    if (tot > 0) {
      out |= ((uint64_t)1 << n);
    }
  }
  return out;
}
template <typename T>
void ref_layer_full(T *inp, T *W, int M, int N, int K, T *out) {
  for (auto n = 0; n < N; ++n) {
    for (auto m = 0; m < M; ++m) {
      T tot = 0;
      for (auto k = 0; k < K; ++k) {
        tot += inp[m * K + k] * W[k * N + n];
      }
      out[m * N + n] = tot;
    }
  }
}
template <typename T>
int32_t ref_layer(int32_t inp, int32_t inp2, std::vector<T> W) {
  int32_t out = 0;
  for (auto n = 0; n < 32; ++n) {
    T tot = 0;
    for (auto k = 0; k < 32; ++k) {
      if ((inp >> k) & 1) {
        tot += W[k * 32 + n];
      }
    }
    for (auto k = 32; k < 64; ++k) {
      if ((inp2 >> (k - 32)) & 1) {
        tot += W[k * 32 + n];
      }
    }
    if (tot > 0) {
      out |= (1 << n);
    }
  }
  return out;
}
// cheap 32-bit mixer used to generate deterministic benchmark inputs, from
// https://crypto.stackexchange.com/questions/16219/cryptographic-hash-function-for-32-bit-length-input-keys
uint32_t perm32(uint32_t x) {
  int n = 12;
  do {
    x = ((x >> 8) ^ x) * 0x6B + n;
  } while (--n != 0);
  return x;
}
template <typename T> void bench() {
  auto W = rand_mat<T>(32, 32);
  auto psW = to_cache(W, 32, 32);
  std::cout << "benchmarking " << 8 * sizeof(T)
            << "bit weight (binary activations)\n";
  auto ref = ref_layer<T, int32_t, int32_t>(123, W);
  std::cout << " ref: " << std::bitset<32>(ref) << "\n";
  auto l = layer<T>(123, psW.data());
  std::cout << " layer: " << std::bitset<32>(l) << "\n";
  assert(ref == l);
  for (auto i = 1; i < (1 << 30); i *= 2) {
    auto test = i + 1337;
    auto ref = ref_layer<T, int32_t, int32_t>(test, W);
    auto l = layer<T>(test, psW.data());
    if (ref != l) {
      std::cerr << "ERROR! input " << test << " breaks test\n";
      return;
    }
  }
  auto X = 0xff1fff17;
  {
    auto W0 = rand_mat<T>(32, 32);
    auto psW0 = to_cache(W0, 32, 32);
    auto iters = 10000000;
    std::cout << " 1 layer 32 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      X ^= layer<T>(X, psW0.data());
      X = ~X >> 1;
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      X ^= layer<T>(X, psW0.data());
      X = ~X >> 1;
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
  {
    auto W0 = rand_mat<T>(32, 32);
    auto psW0 = to_cache(W0, 32, 32);
    auto W1 = rand_mat<T>(32, 32);
    auto psW1 = to_cache(W1, 32, 32);
    auto iters = 10000000;
    std::cout << " 2 layer 32 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      X ^= layer<T>(X, psW0.data());
      X ^= layer<T>(X, psW1.data());
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      X ^= layer<T>(X, psW0.data());
      X = ~X;
      X ^= layer<T>(X, psW1.data());
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
  {
    auto W0 = rand_mat<T>(32, 32);
    auto psW0 = to_cache(W0, 32, 32);
    auto W1 = rand_mat<T>(32, 32);
    auto psW1 = to_cache(W1, 32, 32);
    auto W2 = rand_mat<T>(32, 32);
    auto psW2 = to_cache(W2, 32, 32);
    auto iters = 10000000;
    std::cout << " 3 layer 32 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      X ^= layer<T>(X, psW0.data());
      X ^= layer<T>(X, psW1.data());
      X ^= layer<T>(X, psW2.data());
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      X ^= layer<T>(X, psW0.data());
      X = ~X;
      X ^= layer<T>(X, psW1.data());
      X = ~X;
      X ^= layer<T>(X, psW2.data());
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 32 * 32 * 2 * 3 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
  {
    auto W0 = rand_mat<T>(32, 32);
    auto psW0 = to_cache(W0, 32, 32);
    auto W1 = rand_mat<T>(32, 32);
    auto psW1 = to_cache(W1, 32, 32);
    auto iters = 10000000;
    auto bs = 4;
    int32_t Xs[bs];
    for (auto i = 0; i < bs; ++i) {
      Xs[i] = perm32(i);
    }
    std::cout << " " << bs << " batched 2 layer 32 neuron\n";
    auto rot = [&]() {
      for (auto i = 0; i < bs; ++i) {
        // rotate and flip bottom bits
        auto j = (i + 1) % bs;
        Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
        X ^= Xs[j];
      }
    };
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      rot();
      batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
      batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      rot();
      batch_layer<T, int32_t>(Xs, bs, psW0.data(), Xs);
      batch_layer<T, int32_t>(Xs, bs, psW1.data(), Xs);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = bs * 1.0 * iters * 32 * 32 * 2 * 2 / diff.count() / 1e9;
    std::cout << " infers/s: " << (iters * bs) / diff.count();
    std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
    std::cout << gops << " Gops : ";
    std::cout << diff.count() * 1e9 / iters << "ns/iter) [" << Xs[0] << "]\n";
  }
  {
    auto W0 = rand_mat<T>(32, 32);
    // enough distinct copies of the table to defeat the cache
    auto cache_killer = 10000;
    std::vector<std::vector<T>> ws;
    for (auto i = 0; i < cache_killer; ++i) {
      ws.emplace_back(to_cache(W0, 32, 32));
    }
    auto iters = 10000000;
    std::cout << " cache-killed 1 layer 32 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      auto &w = ws[i % cache_killer];
      X ^= layer<T>(X, w.data());
      X = ~X;
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      auto &w = ws[i % cache_killer];
      X ^= layer<T>(X, w.data());
      X = ~X;
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
}
template <typename T> void bench_64_32() {
  std::cout << "benchmarking " << 8 * sizeof(T)
            << "bit weight 64->32 (binary activations)\n";
  auto W = rand_mat<T>(64, 32);
  auto psW = to_cache(W, 64, 32);
  auto ref = ref_layer<T>(123, 123, W);
  std::cout << " ref: " << std::bitset<32>(ref) << "\n";
  auto l = layer_64_32(123, 123, psW.data());
  std::cout << " layer: " << std::bitset<32>(l) << "\n";
  assert(ref == l);
  for (auto i = 1; i < (1 << 30); i *= 2) {
    auto test = i + 1337;
    auto ref = ref_layer<T>(test, ~test, W);
    auto l = layer_64_32(test, ~test, psW.data());
    if (ref != l) {
      std::cerr << "ERROR! input " << test << " breaks test\n";
      return;
    }
  }
  auto X = 0xff1fff17;
  {
    auto W0 = rand_mat<T>(64, 32);
    auto psW0 = to_cache(W0, 64, 32);
    auto iters = 10000000;
    std::cout << " 1 layer 32 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      X ^= layer_64_32(X, ~X, psW0.data());
      X = ~X;
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      X ^= layer_64_32(X, ~X, psW0.data());
      X = ~X;
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
  {
    auto W0 = rand_mat<T>(64, 32);
    auto psW0 = to_cache(W0, 64, 32);
    auto iters = 10000000;
    auto bs = 8;
    int32_t Xs[bs];
    for (auto i = 0; i < bs; ++i) {
      Xs[i] = perm32(i);
    }
    std::cout << " " << bs << " batched 1 layer 32 neuron\n";
    auto rot = [&]() {
      for (auto i = 0; i < bs; ++i) {
        // rotate and flip bottom bits
        auto j = (i + 1) % bs;
        Xs[j] = ((Xs[i] << 1) | ((uint32_t)Xs[i] >> (32 - 1))) ^ X;
        X ^= Xs[j];
      }
    };
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      rot();
      batch_layer_64_32(Xs, bs, psW0.data(), Xs);
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      rot();
      batch_layer_64_32(Xs, bs, psW0.data(), Xs);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = bs * 1.0 * iters * 64 * 32 * 2 / diff.count() / 1e9;
    std::cout << " infers/s: " << (iters * bs) / diff.count();
    std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
    std::cout << gops << " Gops) [" << Xs[0] << "]\n";
  }
}
template <typename T> void bench_64() {
  auto W = rand_mat<T>(64, 64);
  auto psW = to_cache(W, 64, 64);
  std::cout << "benchmarking 64->64 " << 8 * sizeof(T)
            << "bit weight (binary activations)\n";
  auto ref = ref_layer<T, int64_t, int64_t>(123, W);
  std::cout << " ref: " << std::bitset<64>(ref) << "\n";
  auto l = layer_64<T>(123, psW.data());
  std::cout << " layer: " << std::bitset<64>(l) << "\n";
  assert(ref == l);
  for (auto i = 1; i < (1 << 30); i *= 2) {
    auto test = i + 1337;
    auto ref = ref_layer<T, int64_t, int64_t>(test, W);
    auto l = layer_64<T>(test, psW.data());
    if (ref != l) {
      std::cerr << "ERROR! input " << test << " breaks test\n";
      return;
    }
  }
  int64_t X = 0xff1fff17ffff883f;
  {
    auto W0 = rand_mat<T>(64, 64);
    auto psW0 = to_cache(W0, 64, 64);
    auto iters = 10000000;
    std::cout << " 1 layer 64 neuron\n";
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      X ^= layer_64<T>(X, psW0.data());
      X = ~X;
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      X ^= layer_64<T>(X, psW0.data());
      X = ~X;
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
    std::cout << " iters/s: " << iters / diff.count();
    std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
    std::cout << gops << " Gops) [" << X << "]\n";
  }
  {
    auto W0 = rand_mat<T>(64, 64);
    auto psW0 = to_cache(W0, 64, 64);
    auto iters = 10000000;
    auto bs = 8;
    int64_t Xs[bs];
    for (auto i = 0; i < bs; ++i) {
      Xs[i] = (uint64_t)perm32(i) << 32 | perm32(~i);
    }
    std::cout << " " << bs << " batched 1 layer 64 neuron\n";
    auto rot = [&]() {
      for (auto i = 0; i < bs; ++i) {
        // rotate and flip bottom bits
        auto j = (i + 1) % bs;
        Xs[j] = (((uint64_t)Xs[i] << 1) | ((uint64_t)Xs[i] >> (64 - 1))) ^ X;
        X ^= Xs[j];
      }
    };
    // warm up
    for (size_t i = 0; i < 1000; ++i) {
      rot();
      batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
    }
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < iters; ++i) {
      rot();
      batch_layer<T, int64_t>(Xs, bs, psW0.data(), Xs);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    auto gops = bs * 1.0 * iters * 64 * 64 * 2 / diff.count() / 1e9;
    std::cout << " infers/s: " << (iters * bs) / diff.count();
    std::cout << " (" << diff.count() * 1e9 / (iters * bs) << " ns/infer ";
    std::cout << gops << " Gops) [" << Xs[0] << "]\n";
  }
}
template <typename T> void bench_2bit() {
  auto W = rand_mat<T>(32, 32);
  auto psW = to_cache(W, 32, 32);
  auto X = rand_mat<T>(1, 32);
  for (auto &x : X) {
    x &= 0b11; // clamp activations to 2 bits
  }
  // pack the activations into two bit planes
  int32_t Xb0 = 0;
  int32_t Xb1 = 0;
  for (auto i = 0; i < 32; ++i) {
    Xb0 |= (X[i] & 0b1) << i;
    Xb1 |= ((X[i] & 0b10) >> 1) << i;
  }
  auto o = layer_2bit(Xb0, Xb1, psW.data());
  std::cout << "alg ";
  std::cout << std::bitset<32>(o) << "\n";
  std::vector<T> out(32);
  ref_layer_full(X.data(), W.data(), 1, 32, 32, out.data());
  std::cout << "ref ";
  int32_t ref = 0;
  for (size_t i = 0; i < out.size(); ++i) {
    ref |= (out[i] > 0 ? 1u : 0u) << i;
  }
  std::cout << std::bitset<32>(ref) << "\n";
  int32_t Xs[2];
  Xs[0] = perm32(0 + 1337);
  Xs[1] = perm32(1 + 1337);
  auto iters = 100000000;
  // warm up
  for (size_t i = 0; i < 10000; ++i) {
    auto o = layer_2bit(Xs[0], Xs[1], psW.data());
    Xs[0] ^= o;
    Xs[0] = ~Xs[0] << 1;
    Xs[1] ^= o;
    Xs[1] = ~Xs[1] >> 1;
  }
  auto start = std::chrono::steady_clock::now();
  for (size_t i = 0; i < iters; ++i) {
    auto o = layer_2bit(Xs[0], Xs[1], psW.data());
    Xs[0] ^= o;
    Xs[0] = ~Xs[0] << 1;
    Xs[1] ^= o;
    Xs[1] = ~Xs[1] >> 1;
  }
  auto end = std::chrono::steady_clock::now();
  std::chrono::duration<double> diff = end - start;
  auto gops = 1.0 * iters * 32 * 32 * 2 / diff.count() / 1e9;
  std::cout << " iters/s: " << iters / diff.count();
  std::cout << " (" << diff.count() * 1e9 / iters << " ns/run ";
  std::cout << gops << " Gops) [" << Xs[0] << "]\n";
}
int main(int argc, char **argv) {
#if VEC_WIDTH == 256
  bench<int8_t>();
#endif
  bench<int16_t>();
#if VEC_WIDTH == 512
  bench<float>();
  bench_64_32<int16_t>();
  bench_64<int16_t>();
  bench_64<int8_t>();
#endif
  bench_2bit<int16_t>();
  return 0;
}