Last active
March 1, 2025 06:57
-
-
Save kazuho/541effebcadae35af6c105b843ecfce8 to your computer and use it in GitHub Desktop.
tiny memory load generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
use strict; | |
use warnings; | |
# Filter STDIN to STDOUT one line at a time.  A line that does not already
# mention "MiB" and that contains a comma-grouped number gets the converted
# value from toMiB() appended to it.
while (defined(my $line = <STDIN>)) {
    chomp $line;
    my $already_converted = ($line =~ /MiB/);
    if (!$already_converted && $line =~ /([0-9]+,[0-9,]+)/) {
        $line .= toMiB($1);
    }
    print "$line\n";
}
# Turn a comma-grouped count into a "<value> MiB" string.  The count is
# multiplied by 64 before being scaled to MiB (presumably 64 bytes per counted
# unit, e.g. a cache line -- confirm against the tool producing the input).
sub toMiB {
    my ($count) = @_;
    $count =~ tr/,//d;    # drop the thousands separators
    return sprintf("%.3f MiB", $count * 64 / 1024 / 1024);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <errno.h> | |
#include <pthread.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/time.h> | |
#include <unistd.h> | |
#include <immintrin.h> | |
/* Number of bytes handed to a copy routine per call. */
#define BYTES_PER_BLOCK 8192

/* Block count of the "large" buffer; the initializer amounts to 128MB and main() replaces it when -l is given. */
static size_t num_blocks = (128 * 1024 * 1024) / BYTES_PER_BLOCK;
/*
 * Returns the time reported by gettimeofday() as a floating-point number of seconds; the microsecond field ends up in
 * the fractional part.
 */
double now(void)
{
    struct timeval current;

    gettimeofday(&current, NULL);

    double whole_seconds = current.tv_sec;
    double fraction = current.tv_usec * 0.000001;
    return whole_seconds + fraction;
}
/*
 * One side (source or destination) of a copy test.  llbuf_init() produces an in-memory side (`p` set, `fd` == -1);
 * llbuf_init_fd() produces a file-backed side (`fd` set, `p` == NULL).
 */
struct llbuf {
    char *p;       /* start of the in-memory buffer, or NULL for a file-backed side */
    int fd;        /* descriptor of the backing temporary file, or -1 for an in-memory side */
    size_t blkcnt; /* number of BYTES_PER_BLOCK-sized blocks this side holds */
};
static void llbuf_init(struct llbuf *buf, size_t blkcnt) | |
{ | |
int ret = posix_memalign((void **)&buf->p, 64, blkcnt * BYTES_PER_BLOCK); | |
assert(ret == 0); | |
memset(buf->p, 1, blkcnt * BYTES_PER_BLOCK); | |
buf->fd = -1; | |
buf->blkcnt = blkcnt; | |
} | |
static void llbuf_init_fd(struct llbuf *buf, size_t blkcnt) | |
{ | |
char fn[] = "/tmp/llcload.tmp.XXXXXX"; | |
if ((buf->fd = mkstemp(fn)) == -1) { | |
fprintf(stderr, "failed to create temporary file:%s:%s\n", fn, strerror(errno)); | |
abort(); | |
} | |
unlink(fn); | |
for (size_t i = 0; i < blkcnt; ++i) { | |
static const char zeros[BYTES_PER_BLOCK]; | |
if (write(buf->fd, zeros, BYTES_PER_BLOCK) != BYTES_PER_BLOCK) { | |
fprintf(stderr, "I/O error while writing to temporary file:%s:%s\n", fn, strerror(errno)); | |
abort(); | |
} | |
} | |
fsync(buf->fd); | |
buf->p = NULL; | |
buf->blkcnt = blkcnt; | |
} | |
static void do_test_memcpy(struct llbuf *dest, struct llbuf *src, | |
void *(*doit)(void *, const void *, size_t)) | |
{ | |
for (size_t i = 0; i < num_blocks; ++i) { | |
doit(dest->p + (i % dest->blkcnt) * BYTES_PER_BLOCK, | |
src->p + (i % src->blkcnt) * BYTES_PER_BLOCK, BYTES_PER_BLOCK); | |
} | |
} | |
/* Benchmark pass that moves each block with the C library's memcpy(). */
static void test_memcpy(struct llbuf *dest, struct llbuf *src)
{
    void *(*copy_fn)(void *, const void *, size_t) = memcpy;

    do_test_memcpy(dest, src, copy_fn);
}
/*
 * memcpy-shaped copy implemented with the x86-64 `rep movsb` instruction.
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`.  The instruction advances RDI/RSI and counts RCX down to
 * zero, so it operates on a scratch copy of the destination pointer; the pointer handed back to the caller is the one
 * that was passed in (as memcpy would), not the one-past-the-end address left in RDI.
 *
 * `__asm__ __volatile__` is used instead of the bare `asm` keyword so that the function also builds in strict ISO
 * modes such as -std=c11.
 */
static void *rep_movsb_core(void *dest, const void *src, size_t n)
{
    void *cursor = dest;

    __asm__ __volatile__ (
        "rep movsb"
        : "+D" (cursor), "+S" (src), "+c" (n)
        :
        : "memory"
    );
    return dest;
}
static void test_rep_movsb(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, rep_movsb_core); | |
} | |
/*
 * memcpy-shaped copy implemented with the x86-64 `rep movsq` instruction.
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`.  The bulk is moved eight bytes at a time; when `n` is not
 * a multiple of eight the remaining 1-7 bytes are moved with `rep movsb` so that no trailing bytes are dropped.  For
 * the block size used by the benchmark the tail is empty and only `rep movsq` executes.
 *
 * The instructions advance RDI/RSI, so a scratch copy of the destination pointer is used and the original `dest` is
 * what gets returned.  `__asm__ __volatile__` keeps the function buildable in strict ISO modes such as -std=c11.
 */
static void *rep_movsq_core(void *dest, const void *src, size_t n)
{
    void *cursor = dest;
    size_t qwords = n / 8;
    size_t tail = n % 8;

    __asm__ __volatile__ (
        "rep movsq"
        : "+D" (cursor), "+S" (src), "+c" (qwords)
        :
        : "memory"
    );
    if (tail != 0) {
        /* cursor and src already point just past the last full quadword */
        __asm__ __volatile__ (
            "rep movsb"
            : "+D" (cursor), "+S" (src), "+c" (tail)
            :
            : "memory"
        );
    }
    return dest;
}
static void test_rep_movsq(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, rep_movsq_core); | |
} | |
/*
 * Copies exactly 64 bytes from `src` to `dst` through the 64-bit general-purpose registers r8-r11: two rounds of four
 * 8-byte loads followed by four 8-byte stores.  The pointers are pinned to RSI/RDI by the operand constraints because
 * the instruction text names those registers directly.
 *
 * `__asm__ __volatile__` is used instead of the bare `asm` keyword so that the function also builds in strict ISO
 * modes such as -std=c11; the instruction sequence itself is unchanged.
 */
static inline void reg64_copy64(void *dst, const void *src)
{
    __asm__ __volatile__(
        /* bytes 0-31 */
        "mov (%%rsi), %%r8\n\t"
        "mov 8(%%rsi), %%r9\n\t"
        "mov 16(%%rsi), %%r10\n\t"
        "mov 24(%%rsi), %%r11\n\t"
        "mov %%r8, (%%rdi)\n\t"
        "mov %%r9, 8(%%rdi)\n\t"
        "mov %%r10, 16(%%rdi)\n\t"
        "mov %%r11, 24(%%rdi)\n\t"
        /* bytes 32-63 */
        "mov 32(%%rsi), %%r8\n\t"
        "mov 40(%%rsi), %%r9\n\t"
        "mov 48(%%rsi), %%r10\n\t"
        "mov 56(%%rsi), %%r11\n\t"
        "mov %%r8, 32(%%rdi)\n\t"
        "mov %%r9, 40(%%rdi)\n\t"
        "mov %%r10, 48(%%rdi)\n\t"
        "mov %%r11, 56(%%rdi)\n\t"
        :
        : "D" (dst), "S" (src)
        : "r8", "r9", "r10", "r11", "memory"
    );
}
/*
 * memcpy-shaped routine built on reg64_copy64(): walks the range in 64-byte steps and returns `dest`.
 * NOTE: every step copies a full 64 bytes, so when `n` is not a multiple of 64 the last step reaches past `n`.
 */
static void *test_reg64_core(void *dest, const void *src, size_t n)
{
    char *to = (char *)dest;
    char *from = (char *)src;
    size_t off = 0;

    while (off < n) {
        reg64_copy64(to + off, from + off);
        off += 64;
    }
    return dest;
}
static void test_reg64(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, test_reg64_core); | |
} | |
/*
 * memcpy-shaped copy using 256-bit AVX loads and stores.
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`.  Whole 32-byte vectors go through the aligned AVX
 * load/store intrinsics, so both pointers MUST be 32-byte aligned.  Any trailing n % 32 bytes are finished with
 * memcpy() so that no bytes are dropped; for the block size used by the benchmark the tail is empty.
 *
 * The target attribute enables AVX for this function only, so the file does not depend on a global -mavx flag.  The
 * CPU still has to support AVX at run time.
 */
__attribute__((target("avx")))
static void *avx256_core(void *dest, const void *src, size_t n)
{
    __m256i *d = (__m256i *)dest;
    const __m256i *s = (const __m256i *)src;
    size_t num_vecs = n / sizeof(__m256i);
    size_t tail = n % sizeof(__m256i);

    for (size_t i = 0; i < num_vecs; ++i) {
        __m256i v = _mm256_load_si256(s++);
        _mm256_store_si256(d++, v);
    }
    if (tail != 0)
        memcpy(d, s, tail);
    return dest;
}
static void test_avx256(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, avx256_core); | |
} | |
/*
 * memcpy-shaped copy using 256-bit AVX aligned loads and non-temporal (streaming) stores.
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`.  Both pointers MUST be 32-byte aligned.  Any trailing
 * n % 32 bytes are finished with memcpy() so that no bytes are dropped; for the block size used by the benchmark the
 * tail is empty.  A store fence is issued before returning so the streaming stores are ordered ahead of whatever the
 * caller does next.
 *
 * The target attribute enables AVX for this function only, so the file does not depend on a global -mavx flag.  The
 * CPU still has to support AVX at run time.
 */
__attribute__((target("avx")))
static void *avx256nt_core(void *dest, const void *src, size_t n)
{
    __m256i *d = (__m256i *)dest;
    const __m256i *s = (const __m256i *)src;
    size_t num_vecs = n / sizeof(__m256i);
    size_t tail = n % sizeof(__m256i);

    for (size_t i = 0; i < num_vecs; ++i) {
        __m256i v = _mm256_load_si256(s++);
        _mm256_stream_si256(d++, v);
    }
    if (tail != 0)
        memcpy(d, s, tail);
    _mm_sfence(); /* ensure the streaming stores are visible in memory */
    return dest;
}
static void test_avx256nt(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, avx256nt_core); | |
} | |
/*
 * memcpy-shaped adapter around memset(): fills `n` bytes at `dst` with the value 123 and returns `dst`.  `src` exists
 * only to match the callback signature and is never read.
 */
static void *memset_core(void *dst, const void *src, size_t n)
{
    void *filled = memset(dst, 123, n);

    return filled;
}
static void test_memset(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, memset_core); | |
} | |
/*
 * memcpy-shaped adapter that zero-fills `n` bytes at `dest` with the x86-64 `rep stosb` instruction and returns
 * `dest`.  `src` exists only to match the callback signature and is never read.
 *
 * The instruction advances RDI, so it operates on a scratch copy of the pointer; the value returned is the original
 * `dest` rather than the one-past-the-end address.  `__asm__ __volatile__` keeps the function buildable in strict ISO
 * modes such as -std=c11.
 */
static void *rep_stosb_core(void *dest, const void *src, size_t n)
{
    void *cursor = dest;

    (void)src;
    __asm__ __volatile__ (
        "rep stosb"
        : "+D" (cursor), "+c" (n)
        : "a" (0)
        : "memory"
    );
    return dest;
}
static void test_rep_stosb(struct llbuf *dest, struct llbuf *src) | |
{ | |
do_test_memcpy(dest, src, rep_stosb_core); | |
} | |
static void test_syscall(struct llbuf *dest, struct llbuf *src) | |
{ | |
if (dest->fd != -1) { | |
assert(src->fd == -1 && "m=rw between files not supported (test splice?)"); | |
for (size_t i = 0; i < num_blocks; ++i) { | |
ssize_t ret = pwrite(dest->fd, src->p + (i % src->blkcnt) * BYTES_PER_BLOCK, BYTES_PER_BLOCK, | |
(i % dest->blkcnt) * BYTES_PER_BLOCK); | |
assert(ret == BYTES_PER_BLOCK); | |
} | |
} else if (src->fd != -1) { | |
for (size_t i = 0; i < num_blocks; ++i) { | |
ssize_t ret = pread(src->fd, dest->p + (i % dest->blkcnt) * BYTES_PER_BLOCK, BYTES_PER_BLOCK, | |
(i % src->blkcnt) * BYTES_PER_BLOCK); | |
assert(ret == BYTES_PER_BLOCK); | |
} | |
} else { | |
assert(!"neither side is file"); | |
} | |
} | |
/* Run-time configuration; main() fills it in from the command line and thread_main() reads it. */
static struct {
    /* which side thread_main() sizes at num_blocks; the opposite side gets a single block, and MODE_RW sizes both */
    enum {
        MODE_READ,  /* source is num_blocks, destination is one block */
        MODE_WRITE, /* destination is num_blocks, source is one block */
        MODE_RW,    /* both sides are num_blocks */
    } mode;
    /* routine executed repeatedly by each thread; stays NULL until -a is parsed */
    void (*algo)(struct llbuf *dest, struct llbuf *src);
    /* number of threads running the routine, counting the main thread */
    size_t threads;
    /* how long each thread keeps iterating, in seconds */
    double duration;
} settings = {
    .mode = MODE_READ,
    .algo = NULL,
    .threads = 1,
    .duration = 10,
};
static pthread_mutex_t start_lock = PTHREAD_MUTEX_INITIALIZER; | |
static void *thread_main(void *is_starter_thread) | |
{ | |
struct llbuf dest, src; | |
if (settings.mode == MODE_WRITE && settings.algo == test_syscall) { | |
llbuf_init_fd(&dest, num_blocks); | |
} else { | |
llbuf_init(&dest, settings.mode == MODE_READ ? 1 : num_blocks); | |
} | |
if (settings.mode == MODE_READ && settings.algo == test_syscall) { | |
llbuf_init_fd(&src, num_blocks); | |
} else { | |
llbuf_init(&src, settings.mode == MODE_WRITE ? 1 : num_blocks); | |
} | |
static size_t threads_ready = 0; | |
__sync_fetch_and_add(&threads_ready, 1); | |
if (is_starter_thread) { | |
while (__sync_fetch_and_add(&threads_ready, 0) < settings.threads) | |
usleep(100 * 1000); | |
fprintf(stderr, "all threads are ready... starting benchmark\n"); | |
} else { | |
pthread_mutex_lock(&start_lock); | |
} | |
pthread_mutex_unlock(&start_lock); | |
double start_at = now(), until = start_at + settings.duration; | |
size_t cnt; | |
for (cnt = 0; now() < until; ++cnt) | |
settings.algo(&dest, &src); | |
double elapsed = now() - start_at; | |
double *speed = malloc(sizeof(*speed)); | |
assert(speed != NULL); | |
*speed = (double)cnt * num_blocks * BYTES_PER_BLOCK / elapsed; | |
return speed; | |
} | |
/*
 * Prints the command-line synopsis and option list to stderr.  `arg0` is the program name shown in the synopsis.
 * Only -a is mandatory; the defaults shown for the other options mirror the initial values of `settings` and
 * `num_blocks`.
 */
static void usage(const char *arg0)
{
    fprintf(stderr,
            "Usage: %s -a <algorithm> [-m <mode>] [-t <threads>] [-l <large_size>] [-d <duration>]\n"
            "Options:\n"
            "  -m rd|wr|rw       (default: rd)\n"
            "  -a memcpy|rep-movsb|rep-movsq|reg64|avx256|avx256nt|memset|rep-stosb|syscall\n"
            "  -t <number>       number of threads (default: 1)\n"
            "  -l <large_size>   size of the large buffer in bytes, a multiple of 8192 (default: 128MB)\n"
            "  -d <duration>     seconds to run (default: 10)\n"
            "\n",
            arg0);
}
int main(int argc, char **argv) | |
{ | |
int ch; | |
while ((ch = getopt(argc, argv, "m:a:t:l:d:")) != -1) { | |
switch (ch) { | |
case 'm': /* mode */ | |
if (strcmp(optarg, "rd") == 0) { | |
settings.mode = MODE_READ; | |
} else if (strcmp(optarg, "wr") == 0) { | |
settings.mode = MODE_WRITE; | |
} else if (strcmp(optarg, "rw") == 0) { | |
settings.mode = MODE_RW; | |
} else { | |
fprintf(stderr, "%s: -m takes one of: rd, wr, rw\n", argv[0]); | |
return 1; | |
} | |
break; | |
case 'a': | |
if (strcmp(optarg, "memcpy") == 0) { | |
settings.algo = test_memcpy; | |
} else if (strcmp(optarg, "rep-movsb") == 0) { | |
settings.algo = test_rep_movsb; | |
} else if (strcmp(optarg, "rep-movsq") == 0) { | |
settings.algo = test_rep_movsq; | |
} else if (strcmp(optarg, "reg64") == 0) { | |
settings.algo = test_reg64; | |
} else if (strcmp(optarg, "avx256nt") == 0) { | |
settings.algo = test_avx256nt; | |
} else if (strcmp(optarg, "avx256") == 0) { | |
settings.algo = test_avx256; | |
} else if (strcmp(optarg, "memset") == 0) { | |
settings.algo = test_memset; | |
} else if (strcmp(optarg, "rep-stosb") == 0) { | |
settings.algo = test_rep_stosb; | |
} else if (strcmp(optarg, "syscall") == 0) { | |
settings.algo = test_syscall; | |
} else { | |
fprintf(stderr, "%s: -a takes one of: memcpy, rep-movsb, avx256, avx256nt, memset, rep-stosb, syscall\n", argv[0]); | |
return 1; | |
} | |
break; | |
case 't': | |
if (sscanf(optarg, "%zu", &settings.threads) != 1 || settings.threads < 1) { | |
fprintf(stderr, "%s: -t takes a poitive number\n", argv[0]); | |
return 1; | |
} | |
break; | |
case 'l': | |
if (sscanf(optarg, "%zu", &num_blocks) != 1 || num_blocks % BYTES_PER_BLOCK != 0 || | |
(num_blocks /= BYTES_PER_BLOCK) == 0) { | |
fprintf(stderr, "%s: -l must be a multiple of 8192 (bytes)\n", argv[0]); | |
return 1; | |
} | |
break; | |
case 'd': | |
if (sscanf(optarg, "%lf", &settings.duration) != 1 || !(settings.duration > 0)) { | |
fprintf(stderr, "%s: -d must be a positive number (in seconds)\n", argv[0]); | |
return 1; | |
} | |
break; | |
default: | |
usage(argv[0]); | |
return 1; | |
} | |
} | |
if (settings.algo == NULL) { | |
fprintf(stderr, "%s: missing mandatory option -a\n", argv[0]); | |
return 1; | |
} | |
pthread_mutex_lock(&start_lock); | |
pthread_t tids[settings.threads]; | |
for (size_t i = 1; i < settings.threads; ++i) | |
pthread_create(&tids[i], NULL, thread_main, NULL); | |
double throughput = 0, *ret; | |
ret = thread_main((void *)1); | |
throughput = *ret; | |
free(ret); | |
for (size_t i = 1; i < settings.threads; ++i) { | |
pthread_join(tids[i], (void **)&ret); | |
throughput += *ret; | |
free(ret); | |
} | |
printf("%.3f GB/s\n", throughput / 1000000000); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment