Created
March 27, 2016 12:26
-
-
Save dmikushin/2df0a707f9c93a708c9d to your computer and use it in GitHub Desktop.
AVX-512 horizontal multiply for k1om (Intel Xeon Phi Knights Corner) using Intel compiler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// AVX-512 horizontal multiply for k1om (Intel Xeon Phi Knights Corner)
//
// (c) 2016 Dmitry Mikushin [email protected]
//
// Build, run, and sample output:
// $ icc -mmic -std=c99 -O3 reduce_mul.c -o reduce_mul
// $ micnativeloadex ./reduce_mul
// -0.004276 vs -0.004276
#include <immintrin.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
// Permutation index vector read by _mm512_reverse_pd(). It holds sixteen
// 32-bit indices and is assigned once by init_consts().
static __m512i reverse_mask;

// Fills reverse_mask. Marked as a constructor so that it is run
// automatically rather than being called explicitly from main().
//
// The indices come in (odd, even) pairs because reverse_mask is applied
// to a vector of doubles reinterpreted as 32-bit integers: each double
// occupies two consecutive 32-bit slots that have to travel together.
// NOTE(review): the listing below is assumed to go from the highest
// element down to the lowest, as documented for _mm512_set_epi32.
__attribute__((constructor)) void init_consts()
{
    const __m512i reversed_pairs = _mm512_set_epi32(
         1,  0,  3,  2,
         5,  4,  7,  6,
         9,  8, 11, 10,
        13, 12, 15, 14);

    reverse_mask = reversed_pairs;
}
static inline __m512d _mm512_reverse_pd(__m512d val) | |
{ | |
return (__m512d)_mm512_permutevar_epi32(reverse_mask, (__m512i)val); | |
} | |
static inline double _mm512_reduce_mul_pd(__m512d val) | |
{ | |
__m512d hgfe_dcba = val; | |
__m512d ghef_cdab = _mm512_swizzle_pd(val, _MM_SWIZ_REG_CDAB); | |
__m512d fehg_badc = _mm512_swizzle_pd(val, _MM_SWIZ_REG_BADC); | |
__m512d efgh_abcd = _mm512_swizzle_pd(_mm512_swizzle_pd(val, _MM_SWIZ_REG_BADC), _MM_SWIZ_REG_CDAB); | |
hgfe_dcba = _mm512_mul_pd(_mm512_mul_pd(hgfe_dcba, ghef_cdab), _mm512_mul_pd(fehg_badc, efgh_abcd)); | |
__m512d abcd_efgh = _mm512_reverse_pd(hgfe_dcba); | |
double values[8] __attribute__((aligned(64))); | |
_mm512_store_pd(values, _mm512_mul_pd(hgfe_dcba, abcd_efgh)); | |
return values[0]; | |
} | |
// Self-check: multiplies eight pseudo-random doubles once with
// _mm512_reduce_mul_pd() and once with a plain scalar loop, then prints
// both results side by side for comparison.
// rand() is not seeded, so the input is the same on every run.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    enum { COUNT = 8 };

    // Input vector of values in [-1, 1]; 64-byte aligned because it is
    // read with the aligned load _mm512_load_pd below.
    double values[COUNT] __attribute__((aligned(64)));
    for (int k = 0; k != COUNT; ++k) {
        values[k] = 2 * (0.5 - rand() / (double)RAND_MAX);
    }

    // Reference result: left-to-right scalar product.
    double scalar = values[0];
    for (int k = 1; k != COUNT; ++k) {
        scalar *= values[k];
    }

    // Vectorized result.
    const __m512d packed = _mm512_load_pd(values);
    const double vector = _mm512_reduce_mul_pd(packed);

    printf("%f vs %f\n", vector, scalar);
    fflush(stdout);

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment