Skip to content

Instantly share code, notes, and snippets.

@kazuho
Last active March 1, 2025 06:57
Show Gist options
  • Save kazuho/541effebcadae35af6c105b843ecfce8 to your computer and use it in GitHub Desktop.
Save kazuho/541effebcadae35af6c105b843ecfce8 to your computer and use it in GitHub Desktop.
tiny memory load generator
#! /usr/bin/perl
use strict;
use warnings;
# Filter: copy STDIN to STDOUT line by line. Any line that does not already
# mention "MiB" and contains a comma-grouped number gets that number's
# MiB conversion (see toMiB) appended to it.
while (defined(my $line = <STDIN>)) {
    chomp $line;
    my $already_annotated = $line =~ /MiB/;
    if (!$already_annotated && $line =~ /([0-9]+,[0-9,]+)/) {
        $line .= toMiB($1);
    }
    print $line, "\n";
}
# Converts a comma-grouped count (e.g. "1,048,576") into a string of the form
# "<x.xxx> MiB", treating every counted unit as 64 bytes
# (presumably one cache line -- confirm against the producer of the input).
sub toMiB {
    my ($count) = @_;
    $count =~ tr/,//d;    # strip thousands separators
    return sprintf("%.3f MiB", $count * 64 / 1024 / 1024);
}
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <immintrin.h>
/* Size in bytes of one block: the unit handed to each copy/fill kernel call
 * and to each pread()/pwrite() call. */
#define BYTES_PER_BLOCK 8192
/* Number of blocks processed per invocation of an algorithm, and the block
 * count of the "large" buffer(s). The -l option (given in bytes) overrides it. */
static size_t num_blocks = 128 * 1024 * 1024 / BYTES_PER_BLOCK; /* default is 128MB */
/*
 * Returns the current time of day, as reported by gettimeofday(), expressed
 * in seconds (with a microsecond-resolution fractional part).
 */
double now(void)
{
    struct timeval current;

    gettimeofday(&current, NULL);

    double whole_seconds = current.tv_sec;
    double fraction = current.tv_usec * 0.000001;
    return whole_seconds + fraction;
}
/*
 * A benchmark buffer made of `blkcnt` blocks of BYTES_PER_BLOCK bytes.
 * It is either memory-backed (set up by llbuf_init) or file-backed
 * (set up by llbuf_init_fd).
 */
struct llbuf {
char *p; /* start of the in-memory buffer; NULL when the buffer is file-backed */
int fd; /* descriptor of the backing temporary file; -1 when memory-backed */
size_t blkcnt; /* number of BYTES_PER_BLOCK-sized blocks in the buffer */
};
/*
 * Initializes `buf` as a memory-backed buffer of `blkcnt` blocks.
 *
 * The memory is 64-byte aligned and every byte is set to 1, so all pages are
 * touched before the benchmark starts. `buf->fd` is set to -1 to mark the
 * buffer as memory-backed.
 *
 * Aborts the process with a diagnostic if the size overflows or the
 * allocation fails; the check is explicit so that it also works in builds
 * compiled with NDEBUG.
 */
static void llbuf_init(struct llbuf *buf, size_t blkcnt)
{
    void *mem = NULL;
    size_t nbytes;
    int ret;

    if (blkcnt > SIZE_MAX / BYTES_PER_BLOCK) {
        fprintf(stderr, "buffer size overflows size_t:%zu blocks\n", blkcnt);
        abort();
    }
    nbytes = blkcnt * BYTES_PER_BLOCK;

    /* posix_memalign reports failure through its return value, not errno */
    if ((ret = posix_memalign(&mem, 64, nbytes)) != 0) {
        fprintf(stderr, "failed to allocate %zu bytes:%s\n", nbytes, strerror(ret));
        abort();
    }
    memset(mem, 1, nbytes);

    buf->p = mem;
    buf->fd = -1;
    buf->blkcnt = blkcnt;
}
/*
 * Initializes `buf` as a file-backed buffer of `blkcnt` blocks.
 *
 * Creates a temporary file under /tmp, unlinks it immediately (the open
 * descriptor keeps it alive), fills it with `blkcnt` zeroed blocks and
 * flushes it with fsync(). `buf->p` is set to NULL to mark the buffer as
 * file-backed.
 *
 * Aborts the process with a diagnostic if the file cannot be created or
 * written. Short writes and EINTR are retried instead of being reported as
 * errors with a stale errno.
 */
static void llbuf_init_fd(struct llbuf *buf, size_t blkcnt)
{
    static const char zeros[BYTES_PER_BLOCK];
    char fn[] = "/tmp/llcload.tmp.XXXXXX";

    if ((buf->fd = mkstemp(fn)) == -1) {
        fprintf(stderr, "failed to create temporary file:%s:%s\n", fn, strerror(errno));
        abort();
    }
    unlink(fn);

    for (size_t i = 0; i < blkcnt; ++i) {
        size_t done = 0;
        while (done < BYTES_PER_BLOCK) {
            ssize_t ret = write(buf->fd, zeros + done, BYTES_PER_BLOCK - done);
            if (ret > 0) {
                done += (size_t)ret;
                continue;
            }
            if (ret == -1 && errno == EINTR)
                continue;
            /* errno is only meaningful when write() returned -1 */
            fprintf(stderr, "I/O error while writing to temporary file:%s:%s\n", fn,
                    ret == -1 ? strerror(errno) : "no progress");
            abort();
        }
    }
    if (fsync(buf->fd) != 0)
        fprintf(stderr, "warning: fsync failed on temporary file:%s:%s\n", fn, strerror(errno));

    buf->p = NULL;
    buf->blkcnt = blkcnt;
}
/*
 * Runs one benchmark pass: calls `doit` once per block, num_blocks times.
 *
 * Block i maps to block (i % blkcnt) of each buffer, so a buffer holding
 * fewer than num_blocks blocks is reused cyclically. `doit` has the
 * signature of memcpy() and always receives BYTES_PER_BLOCK as its length.
 */
static void do_test_memcpy(struct llbuf *dest, struct llbuf *src,
                           void *(*doit)(void *, const void *, size_t))
{
    size_t i = 0;

    while (i < num_blocks) {
        char *to = dest->p + (i % dest->blkcnt) * BYTES_PER_BLOCK;
        const char *from = src->p + (i % src->blkcnt) * BYTES_PER_BLOCK;

        doit(to, from, BYTES_PER_BLOCK);
        ++i;
    }
}
/* Benchmark pass that moves every block with the C library's memcpy(). */
static void test_memcpy(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = memcpy;

    do_test_memcpy(dest, src, kernel);
}
/*
 * memcpy()-compatible copy implemented with the x86 "rep movsb" instruction.
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`.
 *
 * "rep movsb" advances RDI/RSI and counts RCX down to zero, so the asm works
 * on local copies of the pointers; the original `dest` is what gets returned
 * (previously the advanced pointer, i.e. dest + n, was returned).
 */
static void *rep_movsb_core(void *dest, const void *src, size_t n)
{
    void *d = dest;
    const void *s = src;

    __asm__ __volatile__(
        "rep movsb"
        : "+D" (d), "+S" (s), "+c" (n)
        :
        : "memory"
    );
    return dest;
}
/* Benchmark pass that moves every block with rep_movsb_core(). */
static void test_rep_movsb(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = rep_movsb_core;

    do_test_memcpy(dest, src, kernel);
}
/*
 * memcpy()-compatible copy implemented with the x86-64 "rep movsq"
 * instruction (8 bytes per iteration).
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`. When `n` is not
 * a multiple of 8, the remaining 1..7 bytes are copied with "rep movsb"
 * (previously they were silently skipped). For lengths that are a multiple
 * of 8 only the "rep movsq" is executed.
 *
 * The asm works on local copies of the pointers because the instruction
 * advances RDI/RSI; the original `dest` is returned.
 */
static void *rep_movsq_core(void *dest, const void *src, size_t n)
{
    void *d = dest;
    const void *s = src;
    size_t qwords = n / 8;
    size_t tail = n % 8;

    __asm__ __volatile__(
        "rep movsq"
        : "+D" (d), "+S" (s), "+c" (qwords)
        :
        : "memory"
    );
    if (tail != 0) {
        /* d and s already point just past the last copied quadword */
        __asm__ __volatile__(
            "rep movsb"
            : "+D" (d), "+S" (s), "+c" (tail)
            :
            : "memory"
        );
    }
    return dest;
}
/* Benchmark pass that moves every block with rep_movsq_core(). */
static void test_rep_movsq(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = rep_movsq_core;

    do_test_memcpy(dest, src, kernel);
}
/*
 * Copies exactly 64 bytes from `src` to `dst` using 64-bit general-purpose
 * register moves (x86-64 only).
 *
 * The copy is done as two 32-byte halves; each half is four 8-byte loads
 * into r8..r11 followed by four 8-byte stores. `dst` and `src` are pinned to
 * RDI and RSI by the "D"/"S" constraints because the asm text names those
 * registers directly.
 */
static inline void reg64_copy64(void *dst, const void *src)
{
asm volatile(
/* first half: bytes 0..31 */
"mov (%%rsi), %%r8\n\t"
"mov 8(%%rsi), %%r9\n\t"
"mov 16(%%rsi), %%r10\n\t"
"mov 24(%%rsi), %%r11\n\t"
"mov %%r8, (%%rdi)\n\t"
"mov %%r9, 8(%%rdi)\n\t"
"mov %%r10, 16(%%rdi)\n\t"
"mov %%r11, 24(%%rdi)\n\t"
/* second half: bytes 32..63 */
"mov 32(%%rsi), %%r8\n\t"
"mov 40(%%rsi), %%r9\n\t"
"mov 48(%%rsi), %%r10\n\t"
"mov 56(%%rsi), %%r11\n\t"
"mov %%r8, 32(%%rdi)\n\t"
"mov %%r9, 40(%%rdi)\n\t"
"mov %%r10, 48(%%rdi)\n\t"
"mov %%r11, 56(%%rdi)\n\t"
:
: "D" (dst), "S" (src)
/* scratch registers used above, plus "memory" since the asm reads and writes through the pointers */
: "r8", "r9", "r10", "r11", "memory"
);
}
/*
 * memcpy()-compatible copy built from reg64_copy64() (64 bytes per step).
 *
 * Copies `n` bytes from `src` to `dest` and returns `dest`. Only whole
 * 64-byte chunks go through reg64_copy64(); a trailing partial chunk is
 * copied with memcpy() so that a length that is not a multiple of 64 never
 * reads or writes past the end of the buffers (the previous loop copied a
 * full 64 bytes for the last partial chunk).
 */
static void *test_reg64_core(void *dest, const void *src, size_t n)
{
    size_t off = 0;

    for (; n - off >= 64; off += 64)
        reg64_copy64((char *)dest + off, (const char *)src + off);
    if (off < n)
        memcpy((char *)dest + off, (const char *)src + off, n - off);
    return dest;
}
/* Benchmark pass that moves every block with test_reg64_core(). */
static void test_reg64(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = test_reg64_core;

    do_test_memcpy(dest, src, kernel);
}
static void *avx256_core(void *dest, const void *src, size_t n)
{
__m256i *d = (__m256i *)dest;
const __m256i *s = (const __m256i *)src;
size_t num_vecs = n / sizeof(__m256i);
for (size_t i = 0; i < num_vecs; ++i) {
__m256i v = _mm256_load_si256(s++);
_mm256_store_si256(d++, v);
}
return dest;
}
/* Benchmark pass that moves every block with avx256_core(). */
static void test_avx256(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = avx256_core;

    do_test_memcpy(dest, src, kernel);
}
static void *avx256nt_core(void *dest, const void *src, size_t n)
{
__m256i *d = (__m256i *)dest;
const __m256i *s = (const __m256i *)src;
size_t num_vecs = n / sizeof(__m256i);
for (size_t i = 0; i < num_vecs; ++i) {
__m256i v = _mm256_load_si256(s++);
_mm256_stream_si256(d++, v);
}
_mm_sfence(); // Ensure stores are visible in memory
return dest;
}
/* Benchmark pass that moves every block with avx256nt_core(). */
static void test_avx256nt(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = avx256nt_core;

    do_test_memcpy(dest, src, kernel);
}
/*
 * Fill kernel with a memcpy()-shaped signature so it can be driven by
 * do_test_memcpy(): sets `n` bytes at `dst` to the value 123 via memset()
 * and returns `dst`. `src` is ignored.
 */
static void *memset_core(void *dst, const void *src, size_t n)
{
    const int fill_byte = 123;

    (void)src;
    return memset(dst, fill_byte, n);
}
/* Benchmark pass that fills every destination block with memset_core(). */
static void test_memset(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = memset_core;

    do_test_memcpy(dest, src, kernel);
}
/*
 * Fill kernel with a memcpy()-shaped signature, implemented with the x86
 * "rep stosb" instruction: zeroes `n` bytes at `dest` and returns `dest`.
 * `src` is ignored.
 *
 * "rep stosb" advances RDI and counts RCX down to zero, so the asm works on
 * a local copy of the pointer; the original `dest` is what gets returned
 * (previously the advanced pointer, i.e. dest + n, was returned).
 */
static void *rep_stosb_core(void *dest, const void *src, size_t n)
{
    void *d = dest;

    (void)src;
    __asm__ __volatile__(
        "rep stosb"
        : "+D" (d), "+c" (n)
        : "a" (0)
        : "memory"
    );
    return dest;
}
/* Benchmark pass that fills every destination block with rep_stosb_core(). */
static void test_rep_stosb(struct llbuf *dest, struct llbuf *src)
{
    void *(*const kernel)(void *, const void *, size_t) = rep_stosb_core;

    do_test_memcpy(dest, src, kernel);
}
/*
 * Benchmark pass that moves blocks between memory and a file with
 * pwrite()/pread(), one BYTES_PER_BLOCK-sized call per block.
 *
 * Exactly one side must be file-backed (fd != -1):
 *   - dest is a file: each block is pwrite()n from the memory source;
 *   - src is a file:  each block is pread() into the memory destination.
 * Block i maps to block (i % blkcnt) on each side.
 *
 * Any unsupported combination, I/O error or short transfer aborts the
 * process with a diagnostic. The checks are explicit rather than assert()s
 * so that a build with NDEBUG cannot silently count failed transfers as
 * throughput.
 */
static void test_syscall(struct llbuf *dest, struct llbuf *src)
{
    if (dest->fd != -1) {
        if (src->fd != -1) {
            fprintf(stderr, "m=rw between files not supported (test splice?)\n");
            abort();
        }
        for (size_t i = 0; i < num_blocks; ++i) {
            ssize_t ret = pwrite(dest->fd, src->p + (i % src->blkcnt) * BYTES_PER_BLOCK, BYTES_PER_BLOCK,
                                 (i % dest->blkcnt) * BYTES_PER_BLOCK);
            if (ret != BYTES_PER_BLOCK) {
                fprintf(stderr, "pwrite failed:%s\n", ret == -1 ? strerror(errno) : "short write");
                abort();
            }
        }
    } else if (src->fd != -1) {
        for (size_t i = 0; i < num_blocks; ++i) {
            ssize_t ret = pread(src->fd, dest->p + (i % dest->blkcnt) * BYTES_PER_BLOCK, BYTES_PER_BLOCK,
                                (i % src->blkcnt) * BYTES_PER_BLOCK);
            if (ret != BYTES_PER_BLOCK) {
                fprintf(stderr, "pread failed:%s\n", ret == -1 ? strerror(errno) : "short read");
                abort();
            }
        }
    } else {
        fprintf(stderr, "neither side is file\n");
        abort();
    }
}
/*
 * Benchmark configuration, filled in from the command line by main() and
 * read by every benchmark thread.
 */
static struct {
/* Which side gets the large (num_blocks) buffer; the other side gets a single block. */
enum {
MODE_READ, /* large source, single-block destination */
MODE_WRITE, /* single-block source, large destination */
MODE_RW, /* both source and destination are large */
} mode;
/* Function that performs one pass over the buffers; stays NULL until -a is given. */
void (*algo)(struct llbuf *dest, struct llbuf *src);
/* Total number of benchmark threads, including the starter thread. */
size_t threads;
/* How long each thread keeps running passes, in seconds. */
double duration;
} settings = {
.mode = MODE_READ,
.threads = 1,
.duration = 10,
};
/*
 * Start gate. main() locks it before creating the worker threads; the
 * starter thread releases it once every thread has finished setting up.
 */
static pthread_mutex_t start_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Body of one benchmark thread.
 *
 * Sets up this thread's own source and destination buffers, waits at the
 * start gate, then calls settings.algo repeatedly until settings.duration
 * seconds have passed.
 *
 * `is_starter_thread` is non-NULL for the one thread (called directly from
 * main) that waits for all threads to become ready and opens the gate.
 *
 * Returns a malloc()ed double holding this thread's throughput in bytes per
 * second; the caller is responsible for freeing it. The buffers are not
 * released by this function.
 */
static void *thread_main(void *is_starter_thread)
{
struct llbuf dest, src;
/* The file-backed side exists only for the syscall algorithm; otherwise the
 * buffer is in memory, one block long on the side that is not being stressed. */
if (settings.mode == MODE_WRITE && settings.algo == test_syscall) {
llbuf_init_fd(&dest, num_blocks);
} else {
llbuf_init(&dest, settings.mode == MODE_READ ? 1 : num_blocks);
}
if (settings.mode == MODE_READ && settings.algo == test_syscall) {
llbuf_init_fd(&src, num_blocks);
} else {
llbuf_init(&src, settings.mode == MODE_WRITE ? 1 : num_blocks);
}
/* Count of threads that have finished buffer setup; shared by all threads
 * and updated atomically. */
static size_t threads_ready = 0;
__sync_fetch_and_add(&threads_ready, 1);
if (is_starter_thread) {
/* poll (atomic read via add-zero) until every thread has checked in */
while (__sync_fetch_and_add(&threads_ready, 0) < settings.threads)
usleep(100 * 1000);
fprintf(stderr, "all threads are ready... starting benchmark\n");
} else {
/* blocks here until the starter thread unlocks the gate below */
pthread_mutex_lock(&start_lock);
}
/* Starter: opens the gate that main() locked. Workers: pass the gate on to
 * the next waiting worker. */
pthread_mutex_unlock(&start_lock);
double start_at = now(), until = start_at + settings.duration;
size_t cnt;
/* each iteration is one full pass of num_blocks blocks */
for (cnt = 0; now() < until; ++cnt)
settings.algo(&dest, &src);
double elapsed = now() - start_at;
double *speed = malloc(sizeof(*speed));
assert(speed != NULL);
/* bytes moved divided by elapsed seconds */
*speed = (double)cnt * num_blocks * BYTES_PER_BLOCK / elapsed;
return speed;
}
/* Writes the command-line help text to stderr; `arg0` is the program name shown in it. */
static void usage(const char *arg0)
{
    static const char help_fmt[] =
        "Usage: %s -m <mode> -a <algorithm> -t <threads>\n"
        "Options:\n"
        " -m rd|wr|rw\n"
        " -a memcpy|rep-movsb|rep-movsq|reg64|avx256|avx256nt|memset|rep-stosb|syscall\n"
        " -t <number>\n"
        " -l <large_size>\n"
        " -d <duration>\n"
        "\n";

    fprintf(stderr, help_fmt, arg0);
}
/*
 * Parses `s` as a strictly positive decimal integer that fits in size_t.
 * Returns 0 and stores the value in `*out` on success; returns -1 for empty
 * input, a leading sign or whitespace, trailing garbage, zero, or overflow.
 */
static int parse_positive_size(const char *s, size_t *out)
{
    char *end;
    unsigned long long v;

    /* strtoull() would accept "-1" and wrap it to a huge value; require a digit first */
    if (*s < '0' || *s > '9')
        return -1;
    errno = 0;
    v = strtoull(s, &end, 10);
    if (errno != 0 || *end != '\0' || v == 0 || v > SIZE_MAX)
        return -1;
    *out = (size_t)v;
    return 0;
}

/*
 * Entry point: parses the command line into `settings` / `num_blocks`, runs
 * the selected algorithm on settings.threads threads (the calling thread is
 * one of them and acts as the starter), and prints the summed throughput in
 * GB/s on stdout.
 *
 * Returns 0 on success and 1 on invalid options or thread-management errors.
 */
int main(int argc, char **argv)
{
    int ch;

    while ((ch = getopt(argc, argv, "m:a:t:l:d:")) != -1) {
        switch (ch) {
        case 'm': /* mode */
            if (strcmp(optarg, "rd") == 0) {
                settings.mode = MODE_READ;
            } else if (strcmp(optarg, "wr") == 0) {
                settings.mode = MODE_WRITE;
            } else if (strcmp(optarg, "rw") == 0) {
                settings.mode = MODE_RW;
            } else {
                fprintf(stderr, "%s: -m takes one of: rd, wr, rw\n", argv[0]);
                return 1;
            }
            break;
        case 'a': /* algorithm */
            if (strcmp(optarg, "memcpy") == 0) {
                settings.algo = test_memcpy;
            } else if (strcmp(optarg, "rep-movsb") == 0) {
                settings.algo = test_rep_movsb;
            } else if (strcmp(optarg, "rep-movsq") == 0) {
                settings.algo = test_rep_movsq;
            } else if (strcmp(optarg, "reg64") == 0) {
                settings.algo = test_reg64;
            } else if (strcmp(optarg, "avx256nt") == 0) {
                settings.algo = test_avx256nt;
            } else if (strcmp(optarg, "avx256") == 0) {
                settings.algo = test_avx256;
            } else if (strcmp(optarg, "memset") == 0) {
                settings.algo = test_memset;
            } else if (strcmp(optarg, "rep-stosb") == 0) {
                settings.algo = test_rep_stosb;
            } else if (strcmp(optarg, "syscall") == 0) {
                settings.algo = test_syscall;
            } else {
                fprintf(stderr,
                        "%s: -a takes one of: memcpy, rep-movsb, rep-movsq, reg64, avx256, avx256nt, memset, "
                        "rep-stosb, syscall\n",
                        argv[0]);
                return 1;
            }
            break;
        case 't': /* thread count */
            if (parse_positive_size(optarg, &settings.threads) != 0) {
                fprintf(stderr, "%s: -t takes a positive number\n", argv[0]);
                return 1;
            }
            break;
        case 'l': { /* size of the large buffer, in bytes */
            size_t bytes;
            if (parse_positive_size(optarg, &bytes) != 0 || bytes % BYTES_PER_BLOCK != 0) {
                fprintf(stderr, "%s: -l must be a multiple of 8192 (bytes)\n", argv[0]);
                return 1;
            }
            num_blocks = bytes / BYTES_PER_BLOCK;
            break;
        }
        case 'd': /* duration in seconds */
            if (sscanf(optarg, "%lf", &settings.duration) != 1 || !(settings.duration > 0)) {
                fprintf(stderr, "%s: -d must be a positive number (in seconds)\n", argv[0]);
                return 1;
            }
            break;
        default:
            usage(argv[0]);
            return 1;
        }
    }
    if (settings.algo == NULL) {
        fprintf(stderr, "%s: missing mandatory option -a\n", argv[0]);
        return 1;
    }
    /* In rw mode both buffers are memory-backed, which the syscall algorithm
     * cannot handle; reject the combination here instead of failing mid-run. */
    if (settings.algo == test_syscall && settings.mode == MODE_RW) {
        fprintf(stderr, "%s: -a syscall requires -m rd or -m wr\n", argv[0]);
        return 1;
    }

    /* Heap-allocated so that a large -t value cannot overflow the stack.
     * Slot 0 is unused: the calling thread plays the role of thread 0. */
    pthread_t *tids = calloc(settings.threads, sizeof(*tids));
    if (tids == NULL) {
        fprintf(stderr, "%s: failed to allocate memory for %zu threads\n", argv[0], settings.threads);
        return 1;
    }

    /* Hold the start gate closed while the workers set up their buffers. */
    pthread_mutex_lock(&start_lock);
    for (size_t i = 1; i < settings.threads; ++i) {
        int rc = pthread_create(&tids[i], NULL, thread_main, NULL);
        if (rc != 0) {
            /* the starter would otherwise wait forever for the missing thread */
            fprintf(stderr, "%s: failed to create thread %zu:%s\n", argv[0], i, strerror(rc));
            return 1;
        }
    }

    double throughput = 0, *ret;
    ret = thread_main((void *)1);
    throughput = *ret;
    free(ret);
    for (size_t i = 1; i < settings.threads; ++i) {
        void *result = NULL;
        int rc = pthread_join(tids[i], &result);
        if (rc != 0 || result == NULL) {
            fprintf(stderr, "%s: failed to join thread %zu:%s\n", argv[0], i,
                    rc != 0 ? strerror(rc) : "no result");
            return 1;
        }
        throughput += *(double *)result;
        free(result);
    }
    free(tids);

    printf("%.3f GB/s\n", throughput / 1000000000);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment