Skip to content

Instantly share code, notes, and snippets.

@dzaima
Last active May 10, 2023 16:29
Show Gist options
  • Save dzaima/5909a0c00e27ad05d1343edca9040b66 to your computer and use it in GitHub Desktop.
Save dzaima/5909a0c00e27ad05d1343edca9040b66 to your computer and use it in GitHub Desktop.
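// WIDTH: number of bytes to load/store in one instruction for bandwidth tests
// default: 32
// WIDTH∊1 2 4 8: regular integer loads/stores
// x86-64: WIDTH==16 - SSE; WIDTH==32 - AVX (256-bit _mm256_*_ps intrinsics)
// aarch64: WIDTH==16 - NEON loads/stores; WIDTH==32 - a pair of adjacent 16-byte NEON loads/stores (intended to compile to ldp/stp)
// UNALIGNED: if nonzero, byte offset added to the 4096-byte-aligned buffer; on x86-64 it also selects the unaligned load/store intrinsics
// CLOCK_RATE: number of clock cycles in a second on the target processor
// TEST_LAT: if nonzero, test latency instead of bandwidth; the latency test chases a chain of aligned pointer-sized loads (8 bytes on 64-bit targets)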
// Build-time configuration; each option can be overridden with -D on the compiler command line.
// CLOCK_RATE is only used to convert measured nanoseconds into clock cycles in the printed results.
#ifndef CLOCK_RATE
#define CLOCK_RATE 3.6e9 // 3.6GHz
#endif
#ifndef TEST_LAT
#define TEST_LAT 0 // 0: bandwidth test (default); nonzero: pointer-chasing latency test
#endif
// TP_ONLY restricts the bandwidth test to a single direction; leave undefined to measure both:
// #define TP_ONLY 1 // only load
// #define TP_ONLY 2 // only store
#if __x86_64__
#include <immintrin.h>
#elif __aarch64__
#include <arm_neon.h>
#endif
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
// GNU C helpers for controlling inlining and for stopping the optimizer from
// deleting or reshaping the memory accesses being benchmarked.

// Function attributes: forbid inlining / force inlining / mark as never returning.
#define NOINLINE __attribute__((noinline))
#define FORCE_INLINE __attribute__((always_inline)) static inline
#define NORETURN __attribute__((noreturn))
// KEEPG(X): empty asm statement that consumes X as an input in a general-purpose register ("r").
// It emits no instructions, but the value of X has to be computed for it.
#define KEEPG(X) __asm__("" :: "r"(X))
// KEEPV(X): same as KEEPG, but with the target's "x" register constraint
// (per GCC's machine constraints this is an SSE register on x86; on aarch64 a SIMD/FP register).
#define KEEPV(X) __asm__("" :: "x"(X))
// IDENTG(X): evaluates X once, passes it through an empty asm as both input and output
// (the "0" constraint ties the input to output operand 0), and yields the result.
// The value is unchanged, but the compiler can no longer see where it came from.
#define IDENTG(X) ({ __auto_type x_ = (X); __asm__("" : "=r"(x_) : "0"(x_)); x_; })
// MEMM: empty asm with a "memory" clobber, i.e. a compiler-level memory barrier (no instruction emitted).
#define MEMM __asm__("" ::: "memory")
//#define KEEP(X) ({ __auto_type x=(X); _Generic(x, void*: KEEPG(x), KEEPV(x)); 1; })
// Short aliases for the fixed-width integer types, grouped by bit width (signed, then unsigned).
typedef int8_t  i8;   typedef uint8_t  u8;
typedef int16_t i16;  typedef uint16_t u16;
typedef int32_t i32;  typedef uint32_t u32;
typedef int64_t i64;  typedef uint64_t u64;
// Double-precision float and the native size/index type.
typedef double f64;
typedef size_t ux;
// Full 64x64 -> 128-bit multiply of *A and *B.
// On return *A holds the low 64 bits of the product and *B the high 64 bits.
static inline void _wymum(uint64_t *A, uint64_t *B){
  const __uint128_t product = (__uint128_t)*A * *B;
  const uint64_t low  = (uint64_t)product;
  const uint64_t high = (uint64_t)(product >> 64);
  // low half is written first, then the high half
  *A = low;
  *B = high;
}
// Mixes two 64-bit values: multiplies them into a 128-bit product via _wymum
// and returns the XOR of the product's low and high halves.
static inline uint64_t _wymix(uint64_t A, uint64_t B){
  uint64_t low = A;
  uint64_t high = B;
  _wymum(&low, &high);
  return low ^ high;
}
// One step of the wyrand PRNG (upstream note: passes BigCrush and PractRand).
// Advances *seed by a fixed odd constant and returns a mix of the new state.
static inline uint64_t wyrand(uint64_t *seed){
  const uint64_t state = *seed + 0xa0761d6478bd642full;
  *seed = state;
  return _wymix(state, state ^ 0xe7037ed1a0b428dbull);
}
// Fast range reduction (credit to Daniel Lemire): maps a 64-bit random value r onto [0,k)
// by returning the high 64 bits of the 128-bit product r*k. Returns 0 when k is 0.
// Upstream note: may not work when WYHASH_32BIT_MUM=1. Can be combined with wyrand, wyhash64 or wyhash.
static inline uint64_t wy2u0k(uint64_t r, uint64_t k){
  uint64_t low = r;
  uint64_t high = k;
  _wymum(&low, &high);
  return high;
}
#include <time.h>
static inline u64 nsTime() {
struct timespec t;
clock_gettime(CLOCK_REALTIME, &t);
return (u64)(t.tv_sec*1000000000ll + t.tv_nsec);
}
// Selection of the access primitives used by the bandwidth test, based on WIDTH:
//   INIT         - declares `val`, the value that gets stored
//   LOAD(P)      - one WIDTH-byte load from P
//   STORE(P, V)  - one WIDTH-byte store of V to P
//   KEEPL(X)     - forces a loaded value X to be materialized so the load is not optimized out
// Unsupported WIDTH/architecture combinations are rejected here with an explicit #error
// instead of failing later with undefined-macro errors.
#ifndef WIDTH
  #define WIDTH 32
#endif

#if WIDTH>8
  // vector accesses: loaded values are kept alive in vector registers
  #define KEEPL(X) KEEPV(X)
  #if __x86_64__
    #if WIDTH==32
      #define INIT __m256 val = _mm256_set1_ps(123);
      #if UNALIGNED
        #define STORE _mm256_storeu_ps
        #define LOAD _mm256_loadu_ps
      #else
        #define STORE _mm256_store_ps
        #define LOAD _mm256_load_ps
      #endif
    #elif WIDTH==16
      #define INIT __m128 val = _mm_set1_ps(123);
      #if UNALIGNED
        #define STORE _mm_storeu_ps
        #define LOAD _mm_loadu_ps
      #else
        #define STORE _mm_store_ps
        #define LOAD _mm_load_ps
      #endif
    #else
      #error "unsupported WIDTH on x86-64: expected 1, 2, 4, 8, 16 or 32"
    #endif
  #elif __aarch64__
    #define INIT float32x4_t val = vdupq_n_f32(123);
    #if WIDTH==16
      #define LOAD vld1q_f32
      #define STORE vst1q_f32
    #elif WIDTH==32
      // 32 bytes are handled as two adjacent 16-byte accesses (P is a float*, so +4 floats = +16 bytes)
      #undef KEEPL
      #define KEEPL(X) ({ __auto_type x=(X); KEEPV(x[0]); KEEPV(x[1]); })
      #define LOAD(P) (float32x4_t[2]){vld1q_f32(P), vld1q_f32((P)+4)}
      // do/while(0) makes the two stores behave as a single statement at the use site
      #define STORE(P,V) do { vst1q_f32((P),(V)); vst1q_f32((P)+4,(V)); } while (0)
    #else
      #error "unsupported WIDTH on aarch64: expected 1, 2, 4, 8, 16 or 32"
    #endif
  #else
    #error "WIDTH>8 is only implemented for x86-64 and aarch64"
  #endif
#else
  // scalar accesses through an unsigned integer type TY of exactly WIDTH bytes
  #if WIDTH==8
    #define TY u64
  #elif WIDTH==4
    #define TY u32
  #elif WIDTH==2
    #define TY u16
  #elif WIDTH==1
    #define TY u8
  #else
    #error "unsupported WIDTH: expected 1, 2, 4, 8, 16 or 32"
  #endif
  // IDENTG hides the constant from the optimizer so `val` is a real runtime value
  #define INIT TY val = IDENTG((TY)123);
  #define KEEPL(X) KEEPG(X)
  #define STORE(P, V) *(TY*)(P) = (V)
  #define LOAD(P) *(TY*)(P)
#endif
// Times sequential WIDTH-byte loads (store==false) or stores (store==true) over data[0..testsize).
//   data      - start of the buffer to access
//   testsize  - bytes covered by one pass; accessed in chunks of WIDTH*32 bytes, so a final
//               partial chunk would run past testsize if it is not a multiple of WIDTH*32
//   rep       - number of passes per timed measurement; must be nonzero (it is used as a divisor)
//   breakonns - return immediately once a measurement's per-pass time is below this many ns
//   store     - selects the store loop instead of the load loop
// Returns the smallest per-pass time seen, in whole nanoseconds (the division by rep truncates).
// Measurements are repeated until their total duration reaches 1e8 ns, unless the early exit triggers.
NOINLINE u64 testThroughput(u8* data, u64 testsize, u64 rep, u64 breakonns, bool store) {
// total time spent in measurements so far
u64 tns = 0;
// best (smallest) per-pass time so far
u64 minns = 1e18;
while(tns < 1e8) {
u64 sns = nsTime();
// declares `val`, the value written by STORE
INIT;
if (store) {
for (u64 k = 0; k < rep; k++) {
for (u64 i = 0; i < testsize; i+= WIDTH*32) {
u8* ptr = IDENTG(data+i); // IDENTG to avoid the addition propagating to complex addressing modes
// 32 stores per chunk; MEMM after each store is a compiler barrier between them
#pragma GCC unroll(32)
for (u64 j = 0; j < 32; j++) { STORE((float*)(ptr + j*WIDTH), val); MEMM; }
}
}
} else {
for (u64 k = 0; k < rep; k++) {
for (u64 i = 0; i < testsize; i+= WIDTH*32) {
u8* ptr = IDENTG(data+i);
// 32 loads per chunk; KEEPL consumes each loaded value so the load cannot be dropped
#pragma GCC unroll(32)
for (u64 j = 0; j < 32; j++) KEEPL(LOAD((float*)(ptr + j*WIDTH)));
}
}
}
u64 ens = nsTime();
// per-pass time of this measurement, truncated to whole nanoseconds
u64 cns = (ens-sns) / rep;
if (cns < minns) {
minns = cns;
if (minns < breakonns) return minns;
}
tns+= ens-sns;
}
return minns;
}
// Times a walk along a chain of pointers that starts at data: each load yields the address of the next load.
//   data      - address of the first pointer in the chain; the chain itself must be set up by the caller
//   bytes     - determines the walk length: bytes/sizeof(void*) loads, rounded up to a multiple of 32
//   rep       - number of walks per timed measurement; must be nonzero (it is used as a divisor)
//   breakonns - return immediately once a measurement's per-walk time is below this many ns
//   unused    - ignored; gives this function the same parameter list as testThroughput
// Returns the smallest per-walk time seen, in whole nanoseconds (the division by rep truncates).
// Measurements are repeated until their total duration reaches 1e8 ns, unless the early exit triggers.
NOINLINE u64 testLatency(u8* data, u64 bytes, u64 rep, u64 breakonns, bool unused) {
// total time spent in measurements so far
u64 tns = 0;
// best (smallest) per-walk time so far
u64 minns = 1e18;
while(tns < 1e8) {
u64 sns = nsTime();
for (u64 k = 0; k < rep; k++) {
// every walk restarts from the head of the chain
void** ptr = (void**)data;
for (u64 i = 0; i < bytes/sizeof(void*); i+= 32) {
// 32 dependent loads per iteration
#pragma GCC unroll(32)
for (u64 j = 0; j < 32; j++) ptr = *ptr;
}
// consume the final pointer so the chain of loads is not optimized away
KEEPG(ptr);
}
u64 ens = nsTime();
// per-walk time of this measurement, truncated to whole nanoseconds
u64 cns = (ens-sns) / rep;
if (cns < minns) {
minns = cns;
if (minns < breakonns) return minns;
}
tns+= ens-sns;
}
return minns;
}
// `test` names the measurement function selected at build time:
// testLatency when TEST_LAT is nonzero, testThroughput otherwise. Both take the same arguments.
#if TEST_LAT
#define test testLatency
#else
#define test testThroughput
#endif
// Benchmark driver: sweeps the working-set size from 1024 bytes up to (but excluding) 100MB,
// growing it by ~10% per step and rounding up to a multiple of 1024, and prints one result line per size.
//   TEST_LAT nonzero: builds a random pointer chain in the buffer and reports load latency.
//   otherwise:        reports load and/or store bandwidth (restricted by TP_ONLY if defined).
// Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
int main(void) {
  const u64 max = 104857600; // 100MB
  // extra room for the alignment bump (up to 4096), the UNALIGNED offset, and the chain slot one past `bytes`
  const u64 slack = 10240;

  u8* raw = malloc(max+slack);
  if (raw == NULL) {
    fprintf(stderr, "failed to allocate %" PRIu64 " bytes\n", max+slack);
    return EXIT_FAILURE;
  }
  // advance to the next 4096-byte boundary (always moves forward by 1..4096 bytes)
  u8* data = (u8*) (((uintptr_t)raw+4096) & ~(uintptr_t)4095);
  #if UNALIGNED
    data+= UNALIGNED;
  #endif

  #if TEST_LAT
    u64 seed = 12345;
    // indexes[k] is a slot number within the buffer; starts as the identity mapping,
    // and the first `ptram` entries are reshuffled for every tested size
    uintptr_t* indexes = malloc(max+slack);
    if (indexes == NULL) {
      fprintf(stderr, "failed to allocate %" PRIu64 " bytes\n", max+slack);
      free(raw);
      return EXIT_FAILURE;
    }
    for (u64 i = 0; i < max/sizeof(void*); i++) indexes[i] = (uintptr_t)i;
  #else
    // write to every byte once before measuring
    for (u64 i = 0; i < max; i++) data[i] = (u8)i;
  #endif

  u64 bytes = 1024;
  while (bytes < max) {
    #if TEST_LAT
      // create random pointer chain
      u64 ptram = bytes/sizeof(void*);
      for (u64 i = 0; i < ptram; i++) { // shuffle ↕ptram pointers
        u64 j = wy2u0k(wyrand(&seed), i);
        uintptr_t t = indexes[i];
        indexes[i] = indexes[j];
        indexes[j] = t;
      }
      void** ptrs = (void**)data;
      // indexes[ptram] has not been shuffled yet, so the chain starts from slot `ptram`
      // and visits indexes[0], indexes[1], ..., ending back at slot `ptram`
      uintptr_t ci = indexes[ptram];
      for (u64 i = 0; i < ptram+1; i++) { // create the chain
        ptrs[ci] = ptrs + indexes[i];
        ci = indexes[i];
      }
    #endif

    u64 nses[2] = {0, 0}; // [0]: load (or latency) result, [1]: store result
    for (int i = 0; i < (TEST_LAT? 1 : 2); i++) {
      #if TP_ONLY
        if (i!=TP_ONLY-1) continue;
      #endif
      bool store = i != 0;
      // Probe with a single pass. The probe must use the same access kind as the real test:
      // when one pass already takes >= 10000ns its result is reported directly.
      u64 minns = test(data, bytes, 1, 10000, store);
      if (minns < 10000) {
        // repeat enough passes for each timed measurement to last about 10000ns;
        // a probe of 0ns (timer too coarse) falls back to the maximum repeat count
        u64 rep = minns ? 10000 / minns : 10000;
        minns = test(data, bytes, rep, 0, store);
      }
      nses[i] = minns;
    }

    #if TEST_LAT
      printf("%9" PRIu64 "B: %6.2fns, %6.2fc; %" PRIu64 " ptrs, %" PRIu64 " total ns\n", bytes, nses[0]*1.0/ptram, (nses[0]/1e9*CLOCK_RATE)/ptram, ptram, nses[0]);
    #else
      printf("%9" PRIu64 "B: ", bytes);
      #if !TP_ONLY || TP_ONLY==1
        printf("load:%7.2fGB/s,%6.2fB/c", bytes/(nses[0]/1e9) / 1e9, bytes/(nses[0]/1e9) / CLOCK_RATE);
      #endif
      #if !TP_ONLY
        printf(" ");
      #endif
      #if !TP_ONLY || TP_ONLY==2
        printf("store:%7.2fGB/s,%6.2fB/c", bytes/(nses[1]/1e9) / 1e9, bytes/(nses[1]/1e9) / CLOCK_RATE);
      #endif
      printf("\n");
    #endif
    fflush(stdout);

    // next size: +10%, rounded up to a multiple of 1024
    bytes*= 1.1;
    bytes = (bytes+1023) & ~(u64)1023;
  }

  #if TEST_LAT
    free(indexes);
  #endif
  free(raw);
  return 0;
}
@dzaima
Copy link
Author

dzaima commented Mar 13, 2023

i3-4160 3.6GHz:

     1024B: load: 204.80GB/s, 56.89B/c      store: 128.00GB/s, 35.56B/c
     2048B: load: 227.56GB/s, 63.21B/c      store: 120.47GB/s, 33.46B/c
     3072B: load: 236.31GB/s, 65.64B/c      store: 118.15GB/s, 32.82B/c
     4096B: load: 227.56GB/s, 63.21B/c      store: 117.03GB/s, 32.51B/c
     5120B: load: 232.73GB/s, 64.65B/c      store: 116.36GB/s, 32.32B/c
     6144B: load: 227.56GB/s, 63.21B/c      store: 115.92GB/s, 32.20B/c
     7168B: load: 231.23GB/s, 64.23B/c      store: 115.61GB/s, 32.11B/c
     8192B: load: 234.06GB/s, 65.02B/c      store: 115.38GB/s, 32.05B/c
     9216B: load: 224.78GB/s, 62.44B/c      store: 115.20GB/s, 32.00B/c
    10240B: load: 227.56GB/s, 63.21B/c      store: 115.06GB/s, 31.96B/c
    11264B: load: 225.28GB/s, 62.58B/c      store: 114.94GB/s, 31.93B/c
    13312B: load: 221.87GB/s, 61.63B/c      store: 114.76GB/s, 31.88B/c
    15360B: load: 225.88GB/s, 62.75B/c      store: 115.49GB/s, 32.08B/c
    17408B: load: 226.08GB/s, 62.80B/c      store: 115.28GB/s, 32.02B/c
    19456B: load: 226.23GB/s, 62.84B/c      store: 115.12GB/s, 31.98B/c
    21504B: load: 224.00GB/s, 62.22B/c      store: 114.99GB/s, 31.94B/c
    24576B: load: 225.47GB/s, 62.63B/c      store: 114.84GB/s, 31.90B/c
    27648B: load: 224.78GB/s, 62.44B/c      store: 115.20GB/s, 32.00B/c
    30720B: load: 222.61GB/s, 61.84B/c      store: 114.63GB/s, 31.84B/c
    33792B: load: 152.90GB/s, 42.47B/c      store:  81.62GB/s, 22.67B/c
    37888B: load: 110.14GB/s, 30.59B/c      store:  37.48GB/s, 10.41B/c
    41984B: load: 111.36GB/s, 30.93B/c      store:  38.31GB/s, 10.64B/c
    47104B: load: 110.06GB/s, 30.57B/c      store:  38.17GB/s, 10.60B/c
    52224B: load: 111.11GB/s, 30.87B/c      store:  38.29GB/s, 10.64B/c
    58368B: load: 110.97GB/s, 30.82B/c      store:  38.27GB/s, 10.63B/c
    64512B: load: 110.47GB/s, 30.68B/c      store:  37.42GB/s, 10.39B/c
    71680B: load: 109.94GB/s, 30.54B/c      store:  38.23GB/s, 10.62B/c
    78848B: load: 112.32GB/s, 31.20B/c      store:  37.44GB/s, 10.40B/c
    87040B: load: 112.02GB/s, 31.12B/c      store:  37.40GB/s, 10.39B/c
    96256B: load: 107.67GB/s, 29.91B/c      store:  37.44GB/s, 10.40B/c
   106496B: load: 107.25GB/s, 29.79B/c      store:  37.38GB/s, 10.38B/c
   117760B: load: 106.28GB/s, 29.52B/c      store:  38.89GB/s, 10.80B/c
   130048B: load: 107.30GB/s, 29.81B/c      store:  37.99GB/s, 10.55B/c
   143360B: load: 101.31GB/s, 28.14B/c      store:  38.02GB/s, 10.56B/c
   157696B: load:  98.07GB/s, 27.24B/c      store:  37.70GB/s, 10.47B/c
   174080B: load: 101.74GB/s, 28.26B/c      store:  37.93GB/s, 10.54B/c
   191488B: load:  96.27GB/s, 26.74B/c      store:  37.99GB/s, 10.55B/c
   210944B: load:  82.50GB/s, 22.92B/c      store:  37.24GB/s, 10.34B/c
   232448B: load:  62.71GB/s, 17.42B/c      store:  35.21GB/s,  9.78B/c
   256000B: load:  61.93GB/s, 17.20B/c      store:  33.07GB/s,  9.19B/c
   281600B: load:  62.07GB/s, 17.24B/c      store:  32.60GB/s,  9.06B/c
   310272B: load:  57.58GB/s, 15.99B/c      store:  33.74GB/s,  9.37B/c
   342016B: load:  50.94GB/s, 14.15B/c      store:  30.49GB/s,  8.47B/c
   376832B: load:  48.19GB/s, 13.39B/c      store:  31.78GB/s,  8.83B/c
   414720B: load:  48.26GB/s, 13.40B/c      store:  31.23GB/s,  8.67B/c
   456704B: load:  48.60GB/s, 13.50B/c      store:  30.82GB/s,  8.56B/c
   502784B: load:  46.89GB/s, 13.02B/c      store:  46.76GB/s, 12.99B/c
   553984B: load:  46.65GB/s, 12.96B/c      store:  46.68GB/s, 12.97B/c
   610304B: load:  46.42GB/s, 12.89B/c      store:  46.84GB/s, 13.01B/c
   671744B: load:  46.66GB/s, 12.96B/c      store:  46.41GB/s, 12.89B/c
   739328B: load:  46.75GB/s, 12.98B/c      store:  46.40GB/s, 12.89B/c
   814080B: load:  46.77GB/s, 12.99B/c      store:  46.47GB/s, 12.91B/c
   896000B: load:  46.80GB/s, 13.00B/c      store:  46.73GB/s, 12.98B/c
   986112B: load:  46.40GB/s, 12.89B/c      store:  46.69GB/s, 12.97B/c
  1085440B: load:  46.58GB/s, 12.94B/c      store:  46.42GB/s, 12.90B/c
  1193984B: load:  46.72GB/s, 12.98B/c      store:  46.73GB/s, 12.98B/c
  1313792B: load:  46.75GB/s, 12.98B/c      store:  46.65GB/s, 12.96B/c
  1445888B: load:  46.70GB/s, 12.97B/c      store:  46.41GB/s, 12.89B/c
  1591296B: load:  46.43GB/s, 12.90B/c      store:  46.79GB/s, 13.00B/c
  1751040B: load:  46.02GB/s, 12.78B/c      store:  45.68GB/s, 12.69B/c
  1926144B: load:  45.75GB/s, 12.71B/c      store:  45.75GB/s, 12.71B/c
  2119680B: load:  42.42GB/s, 11.78B/c      store:  42.27GB/s, 11.74B/c
  2331648B: load:  39.21GB/s, 10.89B/c      store:  39.17GB/s, 10.88B/c
  2565120B: load:  35.93GB/s,  9.98B/c      store:  35.93GB/s,  9.98B/c
  2822144B: load:  32.05GB/s,  8.90B/c      store:  32.12GB/s,  8.92B/c
  3104768B: load:  29.26GB/s,  8.13B/c      store:  29.17GB/s,  8.10B/c
  3416064B: load:  30.77GB/s,  8.55B/c      store:  30.87GB/s,  8.57B/c
  3758080B: load:  23.91GB/s,  6.64B/c      store:  23.96GB/s,  6.66B/c
  4133888B: load:  21.81GB/s,  6.06B/c      store:  21.82GB/s,  6.06B/c
  4547584B: load:  23.16GB/s,  6.43B/c      store:  23.19GB/s,  6.44B/c
  5003264B: load:  21.46GB/s,  5.96B/c      store:  21.50GB/s,  5.97B/c
  5504000B: load:  20.18GB/s,  5.61B/c      store:  20.22GB/s,  5.62B/c
  6054912B: load:  19.27GB/s,  5.35B/c      store:  19.30GB/s,  5.36B/c
  6661120B: load:  18.68GB/s,  5.19B/c      store:  18.74GB/s,  5.21B/c
  7327744B: load:  18.33GB/s,  5.09B/c      store:  18.31GB/s,  5.09B/c
  8060928B: load:  17.99GB/s,  5.00B/c      store:  18.04GB/s,  5.01B/c
  8867840B: load:  17.79GB/s,  4.94B/c      store:  17.87GB/s,  4.96B/c
  9754624B: load:  17.73GB/s,  4.93B/c      store:  17.38GB/s,  4.83B/c
 10730496B: load:  17.60GB/s,  4.89B/c      store:  17.64GB/s,  4.90B/c
 11803648B: load:  17.49GB/s,  4.86B/c      store:  17.41GB/s,  4.84B/c
 12984320B: load:  17.34GB/s,  4.82B/c      store:  17.39GB/s,  4.83B/c
 14282752B: load:  17.32GB/s,  4.81B/c      store:  17.33GB/s,  4.82B/c
 15711232B: load:  17.29GB/s,  4.80B/c      store:  17.31GB/s,  4.81B/c
 17283072B: load:  17.30GB/s,  4.81B/c      store:  17.33GB/s,  4.81B/c
 19011584B: load:  17.25GB/s,  4.79B/c      store:  17.27GB/s,  4.80B/c
 20913152B: load:  17.26GB/s,  4.80B/c      store:  17.32GB/s,  4.81B/c
 23005184B: load:  17.23GB/s,  4.79B/c      store:  17.30GB/s,  4.80B/c
 25306112B: load:  17.16GB/s,  4.77B/c      store:  17.21GB/s,  4.78B/c
 27837440B: load:  17.15GB/s,  4.76B/c      store:  17.22GB/s,  4.78B/c
 30621696B: load:  17.16GB/s,  4.77B/c      store:  17.09GB/s,  4.75B/c
 33684480B: load:  17.09GB/s,  4.75B/c      store:  17.05GB/s,  4.74B/c
 37053440B: load:  17.12GB/s,  4.76B/c      store:  17.23GB/s,  4.79B/c
 40759296B: load:  17.11GB/s,  4.75B/c      store:  17.14GB/s,  4.76B/c
 44835840B: load:  17.19GB/s,  4.78B/c      store:  17.10GB/s,  4.75B/c
 49319936B: load:  17.20GB/s,  4.78B/c      store:  17.22GB/s,  4.78B/c
 54252544B: load:  17.21GB/s,  4.78B/c      store:  17.35GB/s,  4.82B/c
 59678720B: load:  17.30GB/s,  4.81B/c      store:  17.31GB/s,  4.81B/c
 65646592B: load:  17.14GB/s,  4.76B/c      store:  17.00GB/s,  4.72B/c
 72211456B: load:  17.01GB/s,  4.73B/c      store:  17.08GB/s,  4.74B/c
 79432704B: load:  16.82GB/s,  4.67B/c      store:  17.06GB/s,  4.74B/c
 87376896B: load:  17.08GB/s,  4.74B/c      store:  17.08GB/s,  4.74B/c
 96114688B: load:  16.95GB/s,  4.71B/c      store:  17.00GB/s,  4.72B/c

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment