Alignment strongly affects vector load bandwidth
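The benchmark below allocates a 64-byte-aligned buffer of SIZE floats (4096 by default, overridable with -DSIZE=...), then times unaligned 128-bit (xmm) and 256-bit (ymm) vector loads over it, in both sequential and shuffled order, for starting offsets of 8, 16, 24, and 32 bytes. Each test is repeated RETRY times and the fastest run is reported as bytes loaded per cycle.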
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx

#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
#include <malloc.h>

#ifndef SIZE   // number of floats to load per test
#define SIZE 4096
#endif

#ifndef RETRY  // retries of test to find minimum time
#define RETRY 100000
#endif

// Serialized timestamp reads: cpuid before rdtsc keeps earlier instructions
// from leaking into the timed region; rdtscp followed by cpuid keeps the
// timed region from leaking past the final read.
#define RDTSC_START(cycles)                                    \
    do {                                                       \
        register unsigned cyc_high, cyc_low;                   \
        __asm volatile("cpuid\n\t"                             \
                       "rdtsc\n\t"                             \
                       "mov %%edx, %0\n\t"                     \
                       "mov %%eax, %1\n\t"                     \
                       : "=r" (cyc_high), "=r" (cyc_low)       \
                       :: "%rax", "%rbx", "%rcx", "%rdx");     \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;       \
    } while (0)

#define RDTSC_FINAL(cycles)                                    \
    do {                                                       \
        register unsigned cyc_high, cyc_low;                   \
        __asm volatile("rdtscp\n\t"                            \
                       "mov %%edx, %0\n\t"                     \
                       "mov %%eax, %1\n\t"                     \
                       "cpuid\n\t"                             \
                       : "=r" (cyc_high), "=r" (cyc_low)       \
                       :: "%rax", "%rbx", "%rcx", "%rdx");     \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;       \
    } while (0)

// Run func(args) RETRY times and report bandwidth for the fastest run.
#define BEST_TIME(func, args...)                                        \
    do {                                                                \
        printf("%-30s: ", #func);                                       \
        fflush(NULL);                                                   \
        uint64_t cycles_start, cycles_final, cycles_diff;               \
        uint64_t min_diff = (uint64_t) -1;                              \
        for (int i = 0; i < RETRY; i++) {                               \
            __asm volatile ("" ::: /* pretend to clobber */ "memory");  \
            RDTSC_START(cycles_start);                                  \
            func(args);                                                 \
            RDTSC_FINAL(cycles_final);                                  \
            cycles_diff = (cycles_final - cycles_start);                \
            if (cycles_diff < min_diff) min_diff = cycles_diff;         \
        }                                                               \
        float bytes_per_cycle = SIZE * sizeof(float) / (float)min_diff; \
        printf("%.2f bytes/cycle\n", bytes_per_cycle);                  \
        fflush(NULL);                                                   \
    } while (0)

// Load from base + constant byte offset via inline asm so the compiler
// cannot optimize away the otherwise unused load.
#define VEC_LOAD_OFFSET_BASE(load, offset, base)                          \
    __asm volatile ("vmovups %c1(%2), %0" :                               \
                    "=x" (load) :  /* xmm or ymm destination register */  \
                    "i" (offset),  /* constant array offset in bytes */   \
                    "r" (base)     /* read only memory location */        \
                    )

typedef __m256 ymm_t;
typedef __m128 xmm_t;

// issue 32B loads covering all elements in array (linear order)
void load_ymm(float *array, size_t size) {
    if (size % 32 != 0) exit(1);
    float *end = array + size;
    while (array < end) {
        ymm_t dummy;
        VEC_LOAD_OFFSET_BASE(dummy, 0, array);
        VEC_LOAD_OFFSET_BASE(dummy, 32, array);
        VEC_LOAD_OFFSET_BASE(dummy, 64, array);
        VEC_LOAD_OFFSET_BASE(dummy, 96, array);
        array += 32;
    }
}

// issue 32B loads covering all elements in array (non-sequential)
void load_ymm_nonsequential(float *array, size_t size) {
    if (size % 32 != 0) exit(1);
    float *end = array + size;
    while (array < end) {
        ymm_t dummy;
        VEC_LOAD_OFFSET_BASE(dummy, 96, array);
        VEC_LOAD_OFFSET_BASE(dummy, 32, array);
        VEC_LOAD_OFFSET_BASE(dummy, 64, array);
        VEC_LOAD_OFFSET_BASE(dummy, 0, array);
        array += 32;
    }
}

// issue 16B loads for all elements in array (linear order)
void load_xmm(float *array, size_t size) {
    if (size % 32 != 0) exit(1);
    float *end = array + size;
    while (array < end) {
        xmm_t dummy;
        VEC_LOAD_OFFSET_BASE(dummy, 0, array);
        VEC_LOAD_OFFSET_BASE(dummy, 16, array);
        VEC_LOAD_OFFSET_BASE(dummy, 32, array);
        VEC_LOAD_OFFSET_BASE(dummy, 48, array);
        VEC_LOAD_OFFSET_BASE(dummy, 64, array);
        VEC_LOAD_OFFSET_BASE(dummy, 80, array);
        VEC_LOAD_OFFSET_BASE(dummy, 96, array);
        VEC_LOAD_OFFSET_BASE(dummy, 112, array);
        array += 32;
    }
}

// issue 16B loads for all elements in array (non-sequential)
void load_xmm_nonsequential(float *array, size_t size) {
    if (size % 32 != 0) exit(1);
    float *end = array + size;
    while (array < end) {
        xmm_t dummy;
        VEC_LOAD_OFFSET_BASE(dummy, 0, array);
        VEC_LOAD_OFFSET_BASE(dummy, 64, array);
        VEC_LOAD_OFFSET_BASE(dummy, 32, array);
        VEC_LOAD_OFFSET_BASE(dummy, 96, array);
        VEC_LOAD_OFFSET_BASE(dummy, 16, array);
        VEC_LOAD_OFFSET_BASE(dummy, 80, array);
        VEC_LOAD_OFFSET_BASE(dummy, 48, array);
        VEC_LOAD_OFFSET_BASE(dummy, 112, array);
        array += 32;
    }
}

int main(int argc, char **argv) {
    size_t size = SIZE;
    size_t raw_align = 64;
    size_t raw_size = size * sizeof(float) + raw_align;
    void *raw_ptr = memalign(raw_align, raw_size);
    memset(raw_ptr, 0, raw_size);
    printf("Loading %d floats with %zu byte raw alignment\n", SIZE, raw_align);
    for (size_t offset = 8; offset <= 32; offset += 8) {
        float *array = (float *)((char *)raw_ptr + offset);
        printf("Vector alignment %zu:\n", offset);
        BEST_TIME(load_xmm, array, size);
        BEST_TIME(load_xmm_nonsequential, array, size);
        BEST_TIME(load_ymm, array, size);
        BEST_TIME(load_ymm_nonsequential, array, size);
        printf("\n");
    }
    return 0;
}
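Why the starting offset matters: a 32-byte load that straddles a 64-byte cache-line boundary touches two lines instead of one. Assuming the usual 64-byte line size (an assumption about the test machine, not something the benchmark verifies), the standalone sketch below counts how many of the four ymm loads issued per 128-byte block are line-splitting at each offset the benchmark uses: offsets 8, 16, and 24 split two of the four, while offset 32 splits none.

// split_check.c -- hypothetical companion sketch, not part of the gist above.
// Assumes a 64-byte cache line and counts, for each starting offset the
// benchmark uses, how many of the four 32-byte (ymm) loads per 128-byte
// block cross a line boundary.
#include <stddef.h>
#include <stdio.h>

#define LINE 64

// A load of 'width' bytes starting at byte 'addr' touches two cache lines
// exactly when its first and last bytes fall in different lines.
static int splits_line(size_t addr, size_t width) {
    return (addr / LINE) != ((addr + width - 1) / LINE);
}

int main(void) {
    for (size_t offset = 8; offset <= 32; offset += 8) {
        int splits = 0;
        for (size_t load = 0; load < 128; load += 32) {  // the four ymm loads per block
            splits += splits_line(offset + load, 32);
        }
        printf("offset %2zu: %d of 4 ymm loads cross a cache line\n", offset, splits);
    }
    return 0;
}

The sketch only counts line crossings; the measured bandwidth cost of those crossings is what the benchmark above reports.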