Last active
December 17, 2015 08:28
-
-
Save awreece/5580195 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A simple memory profiler.
//
// Each of the read_memory_* functions reads from a 1GB array. Each of the
// write_memory_* functions writes to the 1GB array. The goal is to get the
// max memory bandwidth as advertised by the Intel specs: 25.6 GB/s
// (http://goo.gl/r8Aab)
//
// Compile with clang -mavx memory_profiler.c
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

// The vectorized kernels are only declared (and, further down, defined) when
// the compiler is targeting the matching instruction set.
#ifdef __SSE4_1__
#include <smmintrin.h>
void write_memory_nontemporal_sse(void* array, size_t size);
void write_memory_sse(void* array, size_t size);
void read_memory_sse(void* array, size_t size);
#endif

#ifdef __AVX__
#include <immintrin.h>
void write_memory_nontemporal_avx(void* array, size_t size);
void write_memory_avx(void* array, size_t size);
void read_memory_avx(void* array, size_t size);
#endif

// Portable kernels, always available.
void write_memory_loop(void* array, size_t size);
void read_memory_loop(void* array, size_t size);
void write_memory_memset(void* array, size_t size);

// Number of timed runs per kernel; the fastest run is the one reported.
#define SAMPLES 5
#define BYTES_PER_GB (1024*1024*1024LL)
#define USECS_PER_SEC (1000*1000)
// Size in bytes of the buffer every kernel reads or writes.
#define SIZE (1*BYTES_PER_GB)

// This must be at least 32 byte aligned to make some AVX instructions happy.
char array[SIZE] __attribute__ ((aligned (32)));
// Compute the bandwidth in GiB/s. | |
static inline double to_bw(size_t bytes, size_t usecs) { | |
double size_bytes = (double) bytes; | |
double size_gb = size_bytes / ((double) BYTES_PER_GB); | |
double time_secs = usecs / ((double) USECS_PER_SEC); | |
return size_gb / time_secs; | |
} | |
// Time a function, printing out time to perform the memory operation and | |
// the computed memory bandwidth. | |
#define timefun(f) timeit(f, #f) | |
void timeit(void (*function)(void*,size_t), char* name) { | |
size_t min_usec = 9999999999; | |
size_t i; | |
for (i = 0; i < SAMPLES; i++) { | |
struct timeval before, after, total; | |
gettimeofday(&before, NULL); | |
function(array, SIZE); | |
gettimeofday(&after, NULL); | |
timersub(&after, &before, &total); | |
size_t time = (total.tv_sec * 1000000) + total.tv_usec; | |
if (time < min_usec) { | |
min_usec = time; | |
} | |
} | |
printf("%30s: %5.2f GiB/s\n", name, to_bw(SIZE, min_usec)); | |
} | |
int main() { | |
memset(array, 0xFF, SIZE); // un-ZFOD the page. | |
timefun(read_memory_loop); | |
#ifdef __SSE4_1__ | |
timefun(read_memory_sse); | |
#endif | |
#ifdef __AVX__ | |
timefun(read_memory_avx); | |
#endif | |
timefun(write_memory_loop); | |
#ifdef __SSE4_1__ | |
timefun(write_memory_sse); | |
timefun(write_memory_nontemporal_sse); | |
#endif | |
#ifdef __AVX__ | |
timefun(write_memory_avx); | |
timefun(write_memory_nontemporal_avx); | |
#endif | |
timefun(write_memory_memset); | |
return 0; | |
} | |
// Write kernel that delegates to the C library: fills the first `size` bytes
// of `array` with 0xff using memset.
void write_memory_memset(void* array, size_t size) {
  memset(array, 0xff, size);
}
// Write kernel using a plain scalar loop: stores the value 1 into every
// size_t-sized word of `array`.
//
// array: destination buffer, accessed as an array of size_t.
// size:  buffer length in bytes; trailing bytes that do not fill a whole
//        size_t are left untouched.
void write_memory_loop(void* array, size_t size) {
  size_t* words = (size_t*) array;
  const size_t count = size / sizeof(size_t);
  for (size_t i = 0; i < count; i++) {
    words[i] = 1;
  }
}
// Read kernel using a plain scalar loop: sums every size_t-sized word of
// `array`.
//
// array: source buffer, accessed as an array of size_t; it is not modified.
// size:  buffer length in bytes; trailing bytes that do not fill a whole
//        size_t are not read.
void read_memory_loop(void* array, size_t size) {
  // The sum is published through a volatile object so the compiler has to
  // perform the reads. Unlike an assert on the sum, this still works when
  // NDEBUG is defined and can never abort on an unlucky data pattern.
  static volatile size_t sink;

  const size_t* words = (const size_t*) array;
  const size_t count = size / sizeof(size_t);
  size_t sum = 0;
  for (size_t i = 0; i < count; i++) {
    sum += words[i];
  }
  sink = sum;
}
#ifdef __SSE4_1__ | |
// Write kernel using 128-bit non-temporal (streaming) stores.
//
// array: destination buffer; must be 16-byte aligned.
// size:  buffer length in bytes; trailing bytes that do not fill a whole
//        16-byte vector are left untouched.
//
// The stored vector starts as 1 in every 32-bit lane and is doubled (as
// 16-bit lanes) after each store, so successive vectors hold 1, 2, 4, ... and
// become all-zero from the 17th store on. NOTE(review): presumably the
// changing value is there to stop the compiler from turning the loop into a
// memset -- confirm.
void write_memory_nontemporal_sse(void* array, size_t size) {
  __m128i* dst = (__m128i*) array;
  const size_t count = size / sizeof(__m128i);
  __m128i vals = _mm_set1_epi32(1);
  for (size_t i = 0; i < count; i++) {
    _mm_stream_si128(&dst[i], vals);
    vals = _mm_add_epi16(vals, vals);
  }
  // Streaming stores are weakly ordered; fence so they are globally visible
  // before the function returns (and before the caller stops its timer).
  _mm_sfence();
}
// Write kernel using regular (cached) 128-bit aligned stores.
//
// array: destination buffer; must be 16-byte aligned.
// size:  buffer length in bytes; trailing bytes that do not fill a whole
//        16-byte vector are left untouched.
//
// The stored vector starts as 1 in every 32-bit lane and is doubled (as
// 16-bit lanes) after each store, so successive vectors hold 1, 2, 4, ... and
// become all-zero from the 17th store on.
void write_memory_sse(void* array, size_t size) {
  __m128i* dst = (__m128i*) array;
  const size_t count = size / sizeof(__m128i);
  __m128i vals = _mm_set1_epi32(1);
  for (size_t i = 0; i < count; i++) {
    _mm_store_si128(&dst[i], vals);
    vals = _mm_add_epi16(vals, vals);
  }
}
// Read kernel using 128-bit aligned loads: adds every 16-byte vector of
// `array` into an accumulator (as 16-bit lanes).
//
// array: source buffer; must be 16-byte aligned; it is not modified.
// size:  buffer length in bytes; trailing bytes that do not fill a whole
//        16-byte vector are not read.
void read_memory_sse(void* array, size_t size) {
  // The result is published through a volatile object so the compiler has to
  // perform the reads. Unlike an assert on the accumulator, this still works
  // when NDEBUG is defined and can never abort on an unlucky data pattern.
  static volatile int sink;

  const __m128i* src = (const __m128i*) array;
  const size_t count = size / sizeof(__m128i);
  __m128i accum = _mm_set1_epi32((int) 0xDEADBEEF);
  for (size_t i = 0; i < count; i++) {
    accum = _mm_add_epi16(_mm_load_si128(&src[i]), accum);
  }

  // Fold all four 32-bit lanes into lane 0 so the sink depends on every
  // lane of the accumulator, not just the lowest one.
  accum = _mm_or_si128(accum, _mm_srli_si128(accum, 8));
  accum = _mm_or_si128(accum, _mm_srli_si128(accum, 4));
  sink = _mm_cvtsi128_si32(accum);
}
#endif | |
#ifdef __AVX__ | |
void write_memory_nontemporal_avx(void* array, size_t size) { | |
__m256* varray = (__m256*) array; | |
__m256 vals = _mm256_set1_ps((float) 0xDEADBEEF); | |
size_t i; | |
for (i = 0; i < size / sizeof(__m256); i++) { | |
_mm256_stream_si256((__m256i*) &varray[i], vals); | |
} | |
} | |
void write_memory_avx(void* array, size_t size) { | |
__m256* varray = (__m256*) array; | |
__m256 vals = _mm256_set1_ps((float) 0xDEADBEEF); | |
size_t i; | |
for (i = 0; i < size / sizeof(__m256i); i++) { | |
_mm256_store_si256((__m256i*) &varray[i], vals); | |
} | |
} | |
void read_memory_avx(void* array, size_t size) { | |
__m256* varray = (__m256*) array; | |
__m256 accum = _mm256_set1_ps((float) 0xDEADBEEF); | |
size_t i; | |
for (i = 0; i < size / sizeof(__m256i); i++) { | |
accum = _mm256_add_ps(varray[i], accum); | |
} | |
// This is unlikely, and we want to make sure the reads are not optimized | |
// away. | |
assert(!_mm256_testz_si256(accum, accum)); | |
} | |
#endif // __AVX__ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ clang -mavx -O3 memory_profiler.c -o memory_profiler | |
$ ./memory_profiler | |
read_memory_loop: 13.23 GiB/s | |
read_memory_sse: 15.93 GiB/s | |
read_memory_avx: 16.81 GiB/s | |
write_memory_loop: 16.87 GiB/s | |
write_memory_sse: 9.15 GiB/s | |
write_memory_nontemporal_sse: 16.87 GiB/s | |
write_memory_avx: 9.13 GiB/s | |
write_memory_nontemporal_avx: 16.64 GiB/s | |
write_memory_memset: 16.86 GiB/s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment