Skip to content

Instantly share code, notes, and snippets.

@awreece
Last active December 17, 2015 08:28
Show Gist options
  • Save awreece/5580195 to your computer and use it in GitHub Desktop.
Save awreece/5580195 to your computer and use it in GitHub Desktop.
// A simple memory profiler.
//
// Each of the read_memory_* functions reads from a 1GB array. Each of the
// write_memory_* functions writes to the 1GB array. The goal is to get the max
// memory bandwidth as advertised by the Intel specs: 25.6 GB/s (http://goo.gl/r8Aab)
//
// Compile with clang -mavx memory_profiler.c
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
void write_memory_nontemporal_sse(void*,size_t);
void write_memory_sse(void*,size_t);
void read_memory_sse(void*,size_t);
#endif
#ifdef __AVX__
#include <immintrin.h>
void write_memory_nontemporal_avx(void*,size_t);
void write_memory_avx(void*,size_t);
void read_memory_avx(void*,size_t);
#endif
void write_memory_loop(void*,size_t);
void read_memory_loop(void*,size_t);
void write_memory_memset(void*,size_t);
// Number of timed runs per benchmark.
#define SAMPLES 5
// Bytes in one GiB (2^30). The LL suffix makes the final product a long long,
// so it cannot overflow a 32-bit int.
#define BYTES_PER_GB (1024*1024*1024LL)
// Microseconds in one second.
#define USECS_PER_SEC (1000*1000)
// Size in bytes of the buffer every benchmark reads from or writes to.
#define SIZE (1*BYTES_PER_GB)
// The benchmark buffer.
// This must be at least 32 byte aligned to make some AVX instructions happy.
char array[SIZE] __attribute__ ((aligned (32)));
// Turn a byte count and an elapsed time in microseconds into a bandwidth
// figure expressed in GiB per second.
static inline double to_bw(size_t bytes, size_t usecs) {
  const double gib = (double) bytes / (double) BYTES_PER_GB;
  const double seconds = usecs / ((double) USECS_PER_SEC);
  return gib / seconds;
}
// Time a function, printing out time to perform the memory operation and
// the computed memory bandwidth.
#define timefun(f) timeit(f, #f)
void timeit(void (*function)(void*,size_t), char* name) {
size_t min_usec = 9999999999;
size_t i;
for (i = 0; i < SAMPLES; i++) {
struct timeval before, after, total;
gettimeofday(&before, NULL);
function(array, SIZE);
gettimeofday(&after, NULL);
timersub(&after, &before, &total);
size_t time = (total.tv_sec * 1000000) + total.tv_usec;
if (time < min_usec) {
min_usec = time;
}
}
printf("%30s: %5.2f GiB/s\n", name, to_bw(SIZE, min_usec));
}
// Program entry point: touch the whole buffer once, then time every benchmark
// in a fixed order (reads first, then writes, memset last), printing one
// result line per benchmark. Always returns 0.
int main() {
  // Benchmarks in the order they are run. The SSE and AVX entries only exist
  // when the compiler was asked to target those instruction sets.
  static const struct {
    void (*fn)(void*, size_t);
    char* label;
  } benchmarks[] = {
    { read_memory_loop, "read_memory_loop" },
#ifdef __SSE4_1__
    { read_memory_sse, "read_memory_sse" },
#endif
#ifdef __AVX__
    { read_memory_avx, "read_memory_avx" },
#endif
    { write_memory_loop, "write_memory_loop" },
#ifdef __SSE4_1__
    { write_memory_sse, "write_memory_sse" },
    { write_memory_nontemporal_sse, "write_memory_nontemporal_sse" },
#endif
#ifdef __AVX__
    { write_memory_avx, "write_memory_avx" },
    { write_memory_nontemporal_avx, "write_memory_nontemporal_avx" },
#endif
    { write_memory_memset, "write_memory_memset" },
  };
  size_t k;

  // Write to every byte up front so the pages are no longer
  // zero-fill-on-demand (ZFOD) by the time the first benchmark is timed.
  memset(array, 0xFF, SIZE);

  for (k = 0; k < sizeof benchmarks / sizeof benchmarks[0]; k++) {
    timeit(benchmarks[k].fn, benchmarks[k].label);
  }
  return 0;
}
// Fill the first `size` bytes of `array` with 0xff using the C library's
// memset.
void write_memory_memset(void* array, size_t size) {
  const int fill = 0xff;
  (void) memset(array, fill, size);
}
// Store the value 1 into every size_t-sized word of the buffer, one word at a
// time. Only whole words are written: any trailing bytes beyond the last
// multiple of sizeof(size_t) are left untouched.
void write_memory_loop(void* array, size_t size) {
  size_t* cursor = (size_t*) array;
  size_t* const end = cursor + size / sizeof(size_t);
  while (cursor != end) {
    *cursor++ = 1;
  }
}
// Read every size_t-sized word of the buffer, one word at a time, summing the
// values as it goes. Only whole words are read; the buffer is not modified.
void read_memory_loop(void* array, size_t size) {
  const size_t* carray = (const size_t*) array;
  size_t val = 0;
  size_t i;
  for (i = 0; i < size / sizeof(size_t); i++) {
    val += carray[i];
  }
  // Hand the sum to a volatile object so the compiler has to perform the
  // reads. Unlike an assert() on the sum, this still works when NDEBUG is
  // defined and can never abort the program, whatever the data adds up to.
  volatile size_t sink = val;
  (void) sink;
}
#ifdef __SSE4_1__
// Write the buffer 16 bytes at a time using SSE non-temporal (streaming)
// stores. `array` must be 16-byte aligned; only whole 16-byte chunks are
// written.
void write_memory_nontemporal_sse(void* array, size_t size) {
  __m128i* varray = (__m128i*) array;
  __m128i vals = _mm_set1_epi32(1);
  size_t i;
  for (i = 0; i < size / sizeof(__m128i); i++) {
    _mm_stream_si128(&varray[i], vals);
    // Double every 16-bit lane so the stored value changes between
    // iterations (presumably to keep the loop from being collapsed into a
    // plain fill -- TODO confirm).
    vals = _mm_add_epi16(vals, vals);
  }
  // Streaming stores are weakly ordered and may still be sitting in
  // write-combining buffers; fence so they are globally visible before the
  // function returns and the caller stops its timer.
  _mm_sfence();
}
// Write the buffer 16 bytes at a time using ordinary aligned SSE stores.
// `array` must be 16-byte aligned; only whole 16-byte chunks are written.
// The stored pattern starts as 1 in every 32-bit lane and has each 16-bit
// lane doubled after every store.
void write_memory_sse(void* array, size_t size) {
  __m128i* dst = (__m128i*) array;
  __m128i* const end = dst + size / sizeof(__m128i);
  __m128i pattern = _mm_set1_epi32(1);
  for (; dst != end; dst++) {
    _mm_store_si128(dst, pattern);
    pattern = _mm_add_epi16(pattern, pattern);
  }
}
// Read the buffer 16 bytes at a time with aligned SSE loads, accumulating the
// data with 16-bit lane-wise adds. `array` must be 16-byte aligned; only whole
// 16-byte chunks are read and the buffer is not modified.
void read_memory_sse(void* array, size_t size) {
  const __m128i* varray = (const __m128i*) array;
  __m128i accum = _mm_set1_epi32((int) 0xDEADBEEF);
  size_t i;
  for (i = 0; i < size / sizeof(__m128i); i++) {
    accum = _mm_add_epi16(varray[i], accum);
  }
  // Hand the accumulator to a volatile object so the compiler has to perform
  // the reads. Unlike an assert() on the result, this still works when NDEBUG
  // is defined and cannot abort when the lanes happen to sum to zero.
  volatile __m128i sink = accum;
  (void) sink;
}
#endif
#ifdef __AVX__
void write_memory_nontemporal_avx(void* array, size_t size) {
__m256* varray = (__m256*) array;
__m256 vals = _mm256_set1_ps((float) 0xDEADBEEF);
size_t i;
for (i = 0; i < size / sizeof(__m256); i++) {
_mm256_stream_si256((__m256i*) &varray[i], vals);
}
}
void write_memory_avx(void* array, size_t size) {
__m256* varray = (__m256*) array;
__m256 vals = _mm256_set1_ps((float) 0xDEADBEEF);
size_t i;
for (i = 0; i < size / sizeof(__m256i); i++) {
_mm256_store_si256((__m256i*) &varray[i], vals);
}
}
void read_memory_avx(void* array, size_t size) {
__m256* varray = (__m256*) array;
__m256 accum = _mm256_set1_ps((float) 0xDEADBEEF);
size_t i;
for (i = 0; i < size / sizeof(__m256i); i++) {
accum = _mm256_add_ps(varray[i], accum);
}
// This is unlikely, and we want to make sure the reads are not optimized
// away.
assert(!_mm256_testz_si256(accum, accum));
}
#endif // __AVX__
$ clang -mavx -O3 memory_profiler.c -o memory_profiler
$ ./memory_profiler
read_memory_loop: 13.23 GiB/s
read_memory_sse: 15.93 GiB/s
read_memory_avx: 16.81 GiB/s
write_memory_loop: 16.87 GiB/s
write_memory_sse: 9.15 GiB/s
write_memory_nontemporal_sse: 16.87 GiB/s
write_memory_avx: 9.13 GiB/s
write_memory_nontemporal_avx: 16.64 GiB/s
write_memory_memset: 16.86 GiB/s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment