Last active
May 10, 2023 16:29
-
-
Save dzaima/5909a0c00e27ad05d1343edca9040b66 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build-time options; each may be predefined, e.g. with -D on the compiler command line.
// WIDTH: number of bytes to load/store in one instruction for bandwidth tests
//   default: 32
//   WIDTH∊1 2 4 8: regular integer loads
//   x86-64: WIDTH==16 - SSE; WIDTH==32 - AVX
//   aarch64: WIDTH==16 - NEON loads; WIDTH==32 - two adjacent NEON loads (intended to become an ldp)
// CLOCK_RATE: number of clock cycles in a second on the target processor
// TEST_LAT: test latency instead of bandwidth; the latency test chases pointers (8-byte loads on 64-bit targets)
// UNALIGNED: if nonzero, the test buffer is offset by this many bytes from its 4096-byte alignment,
//   and the x86-64 vector tests use the unaligned load/store intrinsics
// TP_ONLY: 1 - the bandwidth test measures only loads; 2 - only stores
#ifndef CLOCK_RATE
  #define CLOCK_RATE 3.6e9 // 3.6GHz
#endif
#ifndef TEST_LAT
  #define TEST_LAT 0 // 0: bandwidth test; nonzero: latency test
#endif
// #define TP_ONLY 1 // only load
// #define TP_ONLY 2 // only store
#if __x86_64__ | |
#include <immintrin.h> | |
#elif __aarch64__ | |
#include <arm_neon.h> | |
#endif | |
#include <inttypes.h> | |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stddef.h> | |
// Function attributes (GCC/Clang syntax).
#define NOINLINE __attribute__((noinline))
#define FORCE_INLINE __attribute__((always_inline)) static inline
#define NORETURN __attribute__((noreturn))
// Optimization barriers built from empty inline asm; none of them emit an instruction of their own.
// KEEPG(X): makes the compiler compute X and have it in a general-purpose register ("r" constraint).
#define KEEPG(X) __asm__("" :: "r"(X))
// KEEPV(X): like KEEPG, but with the "x" register constraint (used here for vector values).
#define KEEPV(X) __asm__("" :: "x"(X))
// IDENTG(X): evaluates to X, but routed through an empty asm as an in/out register operand,
// so the compiler can't see how the result relates to the input.
#define IDENTG(X) ({ __auto_type x_ = (X); __asm__("" : "=r"(x_) : "0"(x_)); x_; })
// MEMM: compiler-level memory clobber; memory accesses can't be merged or moved across it by the compiler.
#define MEMM __asm__("" ::: "memory")
//#define KEEP(X) ({ __auto_type x=(X); _Generic(x, void*: KEEPG(x), KEEPV(x)); 1; })
// Short aliases for the fixed-width integer types, double, and size_t.
typedef int8_t   i8;
typedef uint8_t  u8;
typedef int16_t  i16;
typedef uint16_t u16;
typedef int32_t  i32;
typedef uint32_t u32;
typedef int64_t  i64;
typedef uint64_t u64;
typedef double   f64;
typedef size_t   ux;
// Full 64x64 -> 128-bit multiply of *A and *B.
// On return *A holds the low 64 bits of the product and *B the high 64 bits.
static inline void _wymum(uint64_t *A, uint64_t *B){
  __uint128_t r=*A; r*=*B;
  *A=(uint64_t)r; *B=(uint64_t)(r>>64);
}
// Mixes A and B: multiplies them to 128 bits and returns the low half XORed with the high half.
static inline uint64_t _wymix(uint64_t A, uint64_t B){ _wymum(&A,&B); return A^B; }
// The wyrand PRNG, which passes BigCrush and PractRand.
// Advances *seed by a fixed constant and returns a 64-bit value mixed from the new seed.
static inline uint64_t wyrand(uint64_t *seed){
  *seed+=0xa0761d6478bd642full;
  return _wymix(*seed,*seed^0xe7037ed1a0b428dbull);
}
// Fast range reduction of a 64-bit random value r to [0,k), credit to Daniel Lemire.
// The result is the high 64 bits of r*k, so k==0 yields 0.
// May not work when WYHASH_32BIT_MUM=1. It can be combined with wyrand, wyhash64 or wyhash.
static inline uint64_t wy2u0k(uint64_t r, uint64_t k){ _wymum(&r,&k); return k; }
#include <time.h> | |
static inline u64 nsTime() { | |
struct timespec t; | |
clock_gettime(CLOCK_REALTIME, &t); | |
return (u64)(t.tv_sec*1000000000ll + t.tv_nsec); | |
} | |
// Per-WIDTH access primitives used by testThroughput:
//   INIT        - declares `val`, the value that STORE writes
//   LOAD(P)     - loads WIDTH bytes from P
//   STORE(P, V) - stores V (WIDTH bytes) to P
//   KEEPL(X)    - keeps the result of a LOAD alive so the load can't be optimized out
// Unsupported WIDTH/architecture combinations are rejected with #error instead of
// leaving the macros undefined.
#ifndef WIDTH
  #define WIDTH 32
#endif
#if WIDTH>8
  // vector accesses
  #define KEEPL(X) KEEPV(X)
  #if __x86_64__
    #if WIDTH==32
      #define INIT __m256 val = _mm256_set1_ps(123);
      #if UNALIGNED
        #define STORE _mm256_storeu_ps
        #define LOAD _mm256_loadu_ps
      #else
        #define STORE _mm256_store_ps
        #define LOAD _mm256_load_ps
      #endif
    #elif WIDTH==16
      #define INIT __m128 val = _mm_set1_ps(123);
      #if UNALIGNED
        #define STORE _mm_storeu_ps
        #define LOAD _mm_loadu_ps
      #else
        #define STORE _mm_store_ps
        #define LOAD _mm_load_ps
      #endif
    #else
      #error "x86-64: WIDTH must be one of 1, 2, 4, 8, 16, 32"
    #endif
  #elif __aarch64__
    #define INIT float32x4_t val = vdupq_n_f32(123);
    #if WIDTH==16
      #define LOAD vld1q_f32
      #define STORE vst1q_f32
    #elif WIDTH==32
      // 32 bytes are accessed as two adjacent 16-byte vectors (P is a float*, so +4 is +16 bytes)
      #undef KEEPL
      #define KEEPL(X) ({ __auto_type x=(X); KEEPV(x[0]); KEEPV(x[1]); })
      #define LOAD(P) (float32x4_t[2]){vld1q_f32(P), vld1q_f32((P)+4)}
      #define STORE(P,V) do { vst1q_f32((P),(V)); vst1q_f32((P)+4,(V)); } while (0)
    #else
      #error "aarch64: WIDTH must be one of 1, 2, 4, 8, 16, 32"
    #endif
  #else
    #error "WIDTH>8 is only implemented for x86-64 and aarch64"
  #endif
#else
  // scalar accesses through an integer type of exactly WIDTH bytes
  #if WIDTH==8
    #define TY u64
  #elif WIDTH==4
    #define TY u32
  #elif WIDTH==2
    #define TY u16
  #elif WIDTH==1
    #define TY u8
  #else
    #error "WIDTH must be one of 1, 2, 4, 8, 16, 32"
  #endif
  // IDENTG hides the constant from the optimizer
  #define INIT TY val = IDENTG((TY)123);
  #define KEEPL(X) KEEPG(X)
  #define STORE(P, V) *(TY*)(P) = (V)
  #define LOAD(P) *(TY*)(P)
#endif
// Times WIDTH-byte loads (store==false) or stores (store==true) over data[0..testsize).
//   data:      start of the buffer; accessed in chunks of WIDTH*32 bytes, so up to
//              WIDTH*32-1 bytes past testsize may be touched if testsize isn't a multiple of that
//   testsize:  number of bytes covered by one pass
//   rep:       passes per timed attempt (0 is treated as 1)
//   breakonns: return as soon as an attempt's per-pass time is below this many ns
//   store:     true to test stores, false to test loads
// Attempts are repeated until at least 1e8 ns of measured time have accumulated.
// Returns the smallest per-pass time seen, in ns (elapsed time of an attempt divided by rep).
NOINLINE u64 testThroughput(u8* data, u64 testsize, u64 rep, u64 breakonns, bool store) {
  if (rep == 0) rep = 1; // avoid dividing by zero when computing the per-pass time
  u64 tns = 0;
  u64 minns = 1e18;
  while(tns < 1e8) {
    u64 sns = nsTime();
    INIT;
    if (store) {
      for (u64 k = 0; k < rep; k++) {
        for (u64 i = 0; i < testsize; i+= WIDTH*32) {
          u8* ptr = IDENTG(data+i); // IDENTG to avoid the addition propagating to complex addressing modes
          #pragma GCC unroll(32)
          for (u64 j = 0; j < 32; j++) { STORE((float*)(ptr + j*WIDTH), val); MEMM; } // MEMM keeps each store separate
        }
      }
    } else {
      for (u64 k = 0; k < rep; k++) {
        for (u64 i = 0; i < testsize; i+= WIDTH*32) {
          u8* ptr = IDENTG(data+i);
          #pragma GCC unroll(32)
          for (u64 j = 0; j < 32; j++) KEEPL(LOAD((float*)(ptr + j*WIDTH))); // KEEPL forces each load to happen
        }
      }
    }
    u64 ens = nsTime();
    u64 cns = (ens-sns) / rep; // time of a single pass in this attempt
    if (cns < minns) {
      minns = cns;
      if (minns < breakonns) return minns;
    }
    tns+= ens-sns;
  }
  return minns;
}
// Times a chain of dependent pointer loads starting at data.
//   data:      must hold a pointer chain: each visited slot contains the address of the next slot
//   bytes:     one pass follows bytes/sizeof(void*) links, rounded up to a multiple of 32
//   rep:       passes per timed attempt (0 is treated as 1); every pass restarts at data
//   breakonns: return as soon as an attempt's per-pass time is below this many ns
//   unused:    ignored; present so the signature matches testThroughput
// Attempts are repeated until at least 1e8 ns of measured time have accumulated.
// Returns the smallest per-pass time seen, in ns (elapsed time of an attempt divided by rep).
NOINLINE u64 testLatency(u8* data, u64 bytes, u64 rep, u64 breakonns, bool unused) {
  (void)unused;
  if (rep == 0) rep = 1; // avoid dividing by zero when computing the per-pass time
  u64 tns = 0;
  u64 minns = 1e18;
  while(tns < 1e8) {
    u64 sns = nsTime();
    for (u64 k = 0; k < rep; k++) {
      void** ptr = (void**)data;
      for (u64 i = 0; i < bytes/sizeof(void*); i+= 32) {
        #pragma GCC unroll(32)
        for (u64 j = 0; j < 32; j++) ptr = *ptr; // each load's address depends on the previous load
      }
      KEEPG(ptr); // keep the chain's result live so the loads aren't removed
    }
    u64 ens = nsTime();
    u64 cns = (ens-sns) / rep; // time of a single pass in this attempt
    if (cns < minns) {
      minns = cns;
      if (minns < breakonns) return minns;
    }
    tns+= ens-sns;
  }
  return minns;
}
// `test` names the benchmark function that main runs, selected by TEST_LAT.
#if TEST_LAT
  #define test testLatency
#else
  #define test testThroughput
#endif
int main() { | |
u64 max = 104857600; // 100MB | |
u8* data = malloc(max+10240); | |
data = (u8*) (((u64)data+4096) & ~(u64)4095); | |
#if UNALIGNED | |
data+= UNALIGNED; | |
#endif | |
#if TEST_LAT | |
u64 seed = 12345; | |
void** indexes = malloc(max+10240); | |
for (u64 i = 0; i < max/sizeof(void*); i++) indexes[i] = (void*)i; | |
#else | |
for (u64 i = 0; i < max; i++) data[i] = i; | |
#endif | |
u64 bytes = 1024; | |
while (bytes < max) { | |
#if TEST_LAT | |
// create random pointer chain | |
u64 ptram = bytes/sizeof(void*); | |
for (u64 i = 0; i < ptram; i++) { // shuffle ↕ptram pointers | |
u64 j = wy2u0k(wyrand(&seed), i); | |
void* t=indexes[i]; | |
indexes[i] = indexes[j]; | |
indexes[j] = t; | |
} | |
void** ptrs = (void**)data; | |
u64 ci = indexes[ptram]-NULL; | |
for (u64 i = 0; i < ptram+1; i++) { // create the chain | |
ptrs[ci] = ptrs + (indexes[i]-NULL); | |
ci = indexes[i]-NULL; | |
} | |
#endif | |
u64 nses[2]; | |
for (int i = 0; i < (TEST_LAT? 1 : 2); i++) { | |
#if TP_ONLY | |
if (i!=TP_ONLY-1) continue; | |
#endif | |
u64 minns = test(data, bytes, 1, 10000, false); | |
if (minns < 10000) { | |
u64 rep = 10000 / minns; | |
minns = test(data, bytes, rep, 0, i); | |
} | |
nses[i] = minns; | |
} | |
#if TEST_LAT | |
printf("%9ldB: %6.2fns, %6.2fc; %ld ptrs, %ld total ns\n", bytes, nses[0]*1.0/ptram, (nses[0]/1e9*CLOCK_RATE)/ptram, ptram, nses[0]); | |
#else | |
printf("%9ldB: ", bytes); | |
#if !TP_ONLY || TP_ONLY==1 | |
printf("load:%7.2fGB/s,%6.2fB/c", bytes/(nses[0]/1e9) / 1e9, bytes/(nses[0]/1e9) / CLOCK_RATE); | |
#endif | |
#ifndef TP_ONLY | |
printf(" "); | |
#endif | |
#if !TP_ONLY || TP_ONLY==2 | |
printf("store:%7.2fGB/s,%6.2fB/c", bytes/(nses[1]/1e9) / 1e9, bytes/(nses[1]/1e9) / CLOCK_RATE); | |
#endif | |
printf("\n"); | |
#endif | |
fflush(stdout); | |
bytes*= 1.1; | |
bytes = (bytes+1023) & ~(u64)1023; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
i3-4160 3.6GHz: