Created
March 31, 2019 16:04
-
-
Save tanakamura/4970a923ae7d10bb952f3fababf6b10a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <unistd.h> | |
#include <time.h> | |
#include <sys/mman.h> | |
#include <getopt.h> | |
#include <sys/time.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <immintrin.h> | |
#include <pthread.h> | |
#include <clzerointrin.h> | |
double sec(void) { | |
struct timespec ts; | |
clock_gettime(CLOCK_MONOTONIC, &ts); | |
return ts.tv_sec + ts.tv_nsec / 1000000000.0; | |
} | |
typedef void *(*opfn_t) (void *dst, const void *src, size_t sz); | |
#define compiler_mb() __asm__ __volatile__ ("" ::: "memory"); | |
#define cpu_rmb() _mm_lfence() | |
#define cpu_wmb() _mm_sfence() | |
struct __attribute__((aligned(64))) thread_shared { | |
pthread_t t; | |
int ti; | |
char *src; | |
char *dst; | |
size_t max_size; | |
int start; | |
int end; | |
opfn_t op_fn; | |
int num_iter; | |
size_t copy_size; | |
}; | |
static void | |
run_test1(char *dstp, char *srcp, opfn_t opfn, int num_iter, size_t copy_size) | |
{ | |
for (int ii=0; ii<num_iter; ii++) { | |
opfn(dstp, srcp, copy_size); | |
} | |
} | |
void *thread_fn(void *p) | |
{ | |
struct thread_shared *ts = (struct thread_shared*)p; | |
char *srcp = ts->src + ts->ti * ts->max_size; | |
char *dstp = ts->dst + ts->ti * ts->max_size; | |
while (1) { | |
if (ts->start == 2) { | |
break; | |
} | |
if (ts->start != 1) { | |
compiler_mb(); | |
continue; | |
} | |
ts->start = 0; | |
cpu_rmb(); | |
run_test1(dstp, srcp, ts->op_fn, ts->num_iter, ts->copy_size); | |
cpu_wmb(); | |
ts->end = 1; | |
} | |
return NULL; | |
} | |
static void | |
do_test(struct thread_shared *clients, | |
char *dst, char *src, size_t max_size, int nproc, opfn_t opfn, const char *test_name, int mul) | |
{ | |
int ni=1; | |
int last=0; | |
while (!last) { | |
size_t copy_size = 1024; | |
if (ni>=nproc) { | |
ni = nproc; | |
last = 1; | |
} | |
printf("num_thread = %d\n", ni); | |
while (copy_size <= max_size) { | |
int niter = 16384; | |
niter /= (copy_size/512); | |
if (niter < 4) { | |
niter = 4; | |
} | |
double t0 = sec(); | |
for (int ti=0; ti<ni-1; ti++) { | |
clients[ti].op_fn = opfn; | |
clients[ti].num_iter = niter; | |
clients[ti].copy_size = copy_size; | |
cpu_wmb(); | |
clients[ti].start = 1; | |
} | |
run_test1(dst, src, opfn, niter, copy_size); | |
for (int ti=0; ti<ni-1; ti++) { | |
while (1) { | |
if (clients[ti].end == 1) { | |
clients[ti].end = 0; | |
break; | |
} | |
compiler_mb(); | |
} | |
} | |
double t1 = sec(); | |
size_t transfer_size = copy_size * mul; | |
double total = ni * transfer_size * niter; | |
double bps = total / (t1-t0); | |
if (transfer_size < 16*1024) { | |
printf("%-16s : %8d[ B] %f[GB/s]\n", test_name, (int)transfer_size, bps/(1024*1024*1024.0)); | |
} else if (transfer_size < 16*1024ULL*1024ULL) { | |
printf("%-16s : %8d[KB] %f[GB/s]\n", test_name, (int)transfer_size/1024, bps/(1024*1024*1024.0)); | |
} else { | |
printf("%-16s : %8d[MB] %f[GB/s]\n", test_name, (int)transfer_size/(1024*1024), bps/(1024*1024*1024.0)); | |
} | |
copy_size *= 2; | |
} | |
ni *= 2; | |
} | |
} | |
void *libc_memset(void *dst, const void *src, size_t sz) | |
{ | |
memset(dst, 0, sz); | |
return NULL; | |
} | |
void *amd_clzero(void *dst, const void *src, size_t sz) | |
{ | |
size_t line_size = 64; | |
size_t num_line = sz / line_size; | |
unsigned char *d = (unsigned char*)dst; | |
unsigned char *s = (unsigned char*)src; | |
for (int i=0; i<num_line; i++) { | |
_mm_clzero(d); | |
d += line_size; | |
} | |
return NULL; | |
} | |
void *sse_load(void *dst, const void *src, size_t sz) | |
{ | |
size_t sz_sse = sz/16; | |
//__m128 *vdst = dst; | |
const volatile __m128 *vsrc = src; | |
for (size_t i=0; i<sz_sse; i+=8) { | |
vsrc[i+0]; | |
vsrc[i+1]; | |
vsrc[i+2]; | |
vsrc[i+3]; | |
vsrc[i+4]; | |
vsrc[i+5]; | |
vsrc[i+6]; | |
vsrc[i+7]; | |
} | |
return NULL; | |
} | |
void *sse_store(void *dst, const void *src, size_t sz) | |
{ | |
size_t sz_sse = sz/16; | |
__m128 *vdst = dst; | |
//const volatile __m128 *vsrc = src; | |
for (size_t i=0; i<sz_sse; i+=8) { | |
vdst[i+0] = _mm_setzero_ps(); | |
vdst[i+1] = _mm_setzero_ps(); | |
vdst[i+2] = _mm_setzero_ps(); | |
vdst[i+3] = _mm_setzero_ps(); | |
vdst[i+4] = _mm_setzero_ps(); | |
vdst[i+5] = _mm_setzero_ps(); | |
vdst[i+6] = _mm_setzero_ps(); | |
vdst[i+7] = _mm_setzero_ps(); | |
} | |
return NULL; | |
} | |
void *avx_load(void *dst, const void *src, size_t sz) | |
{ | |
size_t sz_sse = sz/32; | |
//__m128 *vdst = dst; | |
const volatile __m256 *vsrc = src; | |
for (size_t i=0; i<sz_sse; i+=4) { | |
vsrc[i+0]; | |
vsrc[i+1]; | |
vsrc[i+2]; | |
vsrc[i+3]; | |
} | |
return NULL; | |
} | |
void *avx_store(void *dst, const void *src, size_t sz) | |
{ | |
size_t sz_sse = sz/32; | |
__m256 *vdst = dst; | |
//const volatile __m128 *vsrc = src; | |
for (size_t i=0; i<sz_sse; i+=4) { | |
vdst[i+0] = _mm256_setzero_ps(); | |
vdst[i+1] = _mm256_setzero_ps(); | |
vdst[i+2] = _mm256_setzero_ps(); | |
vdst[i+3] = _mm256_setzero_ps(); | |
} | |
return NULL; | |
} | |
void *sse_stream_store(void *dst, const void *src, size_t sz) | |
{ | |
size_t sz_sse = sz/16; | |
__m128 *vdst = dst; | |
//const volatile __m128 *vsrc = src; | |
for (size_t i=0; i<sz_sse; i+=4) { | |
_mm_stream_ps((float*)&vdst[i+0], _mm_setzero_ps()); | |
_mm_stream_ps((float*)&vdst[i+1], _mm_setzero_ps()); | |
_mm_stream_ps((float*)&vdst[i+2], _mm_setzero_ps()); | |
_mm_stream_ps((float*)&vdst[i+3], _mm_setzero_ps()); | |
} | |
return NULL; | |
} | |
void *rep_movsb(void *dst, const void *src, size_t sz) | |
{ | |
asm volatile ("rep movsb" | |
:"=D" (dst), "=S" (src), "=c" (sz) | |
: "0" (dst), "1" (src), "2" (sz) | |
: "memory"); | |
return NULL; | |
} | |
void *rep_stosb(void *dst, const void *src, size_t sz) | |
{ | |
asm volatile ("rep stosb" | |
:"=D" (dst), "=S" (src), "=c" (sz) | |
: "0" (dst), "1" (src), "2" (sz) | |
: "memory"); | |
return NULL; | |
} | |
int main(int argc, char **argv) | |
{ | |
int nproc = sysconf(_SC_NPROCESSORS_ONLN); | |
size_t max_size = 128 * 1024 * 1024; | |
int opt; | |
nproc -= 2; | |
if (nproc <= 1) { | |
nproc = 1; | |
} | |
while ((opt = getopt(argc, argv, "t:s:")) != -1) { | |
switch (opt) { | |
case 't': | |
nproc = atoi(optarg); | |
break; | |
case 's': | |
max_size = atoi(optarg) * 1024 * 1024; | |
break; | |
default: | |
fprintf(stderr, "Usage: %s [-t num_thread] [-s size[MB]]\n", | |
argv[0]); | |
return 1; | |
} | |
} | |
char *src = mmap(0, max_size * nproc, PROT_READ|PROT_WRITE, | |
MAP_PRIVATE|MAP_POPULATE|MAP_ANONYMOUS, 0, 0); | |
char *dst = mmap(0, max_size * nproc, PROT_READ|PROT_WRITE, | |
MAP_PRIVATE|MAP_POPULATE|MAP_ANONYMOUS, 0, 0); | |
if (src == MAP_FAILED || dst == MAP_FAILED) { | |
perror("mmap"); | |
exit(1); | |
} | |
memset(src, 0, max_size*nproc); | |
memset(dst, 0, max_size*nproc); | |
struct thread_shared *ts = calloc(1, sizeof(*ts) * nproc-1); | |
for (int ti=0; ti<nproc-1; ti++) { | |
ts[ti].ti = ti+1; | |
ts[ti].src = src; | |
ts[ti].dst = dst; | |
ts[ti].max_size = max_size; | |
pthread_create(&ts[ti].t, NULL, | |
thread_fn, &ts[ti]); | |
} | |
do_test(ts, dst, src, max_size, nproc, libc_memset, "libc-memset", 1); | |
do_test(ts, dst, src, max_size, nproc, memcpy, "libc-memcpy", 2); | |
do_test(ts, dst, src, max_size, nproc, sse_load, "sse-load", 1); | |
do_test(ts, dst, src, max_size, nproc, sse_store, "sse-store", 1); | |
do_test(ts, dst, src, max_size, nproc, sse_store, "sse-stream-store", 1); | |
do_test(ts, dst, src, max_size, nproc, avx_load, "avx-load", 1); | |
do_test(ts, dst, src, max_size, nproc, avx_store, "avx-store", 1); | |
do_test(ts, dst, src, max_size, nproc, rep_movsb, "rep-movsb", 2); | |
do_test(ts, dst, src, max_size, nproc, rep_stosb, "rep-stosb", 1); | |
do_test(ts, dst, src, max_size, nproc, amd_clzero, "amd_clzero", 1); | |
for (int ti=0; ti<nproc-1; ti++) { | |
ts[ti].start = 2; | |
pthread_join(ts[ti].t, NULL); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment