/*
 * Source: GitHub Gist by @tanakamura
 * (tanakamura/7c159d27f744fc24ff8243522b166820),
 * created October 7, 2016 18:37.
 */
#include <immintrin.h>
#include <x86intrin.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Byte-table lookup for 32 indices at once, built on AVX2 dword gathers.
 *
 * lut      Base of the byte table.  Each index i is fetched with a 32-bit
 *          gather at byte offset i (scale 1), i.e. the load covers
 *          lut[i] .. lut[i + 3]; the caller must make those bytes readable.
 * vindex   32 unsigned 8-bit indices.
 * andMask  ANDed with every gathered 32-bit word before narrowing.  The
 *          caller in this file passes 0x000000FF so only lut[i] survives.
 *
 * Returns the 32 masked words narrowed back to bytes (unsigned saturation),
 * in the same byte positions as their indices in vindex.
 */
static inline
__m256i mm256_u8gather_epu8(const uint8_t* lut, __m256i vindex, __m256i andMask) {
    const __m256i zero = _mm256_setzero_si256();
    const int *words = (const int *)lut;

    /*
     * Widen u8 -> u16 -> u32 by interleaving with zero.  The unpacks work
     * within each 128-bit lane, and so do the packs at the end, which is
     * why the original byte order is restored.
     */
    const __m256i half_lo = _mm256_unpacklo_epi8(vindex, zero);
    const __m256i half_hi = _mm256_unpackhi_epi8(vindex, zero);

    const __m256i quad0 = _mm256_unpacklo_epi16(half_lo, zero);
    const __m256i quad1 = _mm256_unpackhi_epi16(half_lo, zero);
    const __m256i quad2 = _mm256_unpacklo_epi16(half_hi, zero);
    const __m256i quad3 = _mm256_unpackhi_epi16(half_hi, zero);

    /* Gather one dword per index, then strip the unwanted bits. */
    const __m256i word0 = _mm256_and_si256(_mm256_i32gather_epi32(words, quad0, 1), andMask);
    const __m256i word1 = _mm256_and_si256(_mm256_i32gather_epi32(words, quad1, 1), andMask);
    const __m256i word2 = _mm256_and_si256(_mm256_i32gather_epi32(words, quad2, 1), andMask);
    const __m256i word3 = _mm256_and_si256(_mm256_i32gather_epi32(words, quad3, 1), andMask);

    /* Narrow u32 -> u16 -> u8. */
    const __m256i narrow_lo = _mm256_packus_epi32(word0, word1);
    const __m256i narrow_hi = _mm256_packus_epi32(word2, word3);
    return _mm256_packus_epi16(narrow_lo, narrow_hi);
}
/*
 * UNROLL16(F): expands to F(0) F(1) ... F(15), in ascending order.
 * Every index is passed as a literal integer token, so F may use it where
 * a compile-time constant is required (e.g. an intrinsic's immediate).
 *
 * The final line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows into the macro body.
 */
#define UNROLL16(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15)
/*
 * UNROLL32(F): expands to F(0) F(1) ... F(31), in ascending order.
 * Every index is passed as a literal integer token, so F may use it where
 * a compile-time constant is required.
 *
 * The final line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows into the macro body.
 */
#define UNROLL32(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15) \
    F(16) F(17) F(18) F(19) \
    F(20) F(21) F(22) F(23) \
    F(24) F(25) F(26) F(27) \
    F(28) F(29) F(30) F(31)
/*
 * UNROLL31(F): expands to F(0) F(1) ... F(30), in ascending order
 * (31 invocations; index 31 is intentionally not included).
 * Every index is passed as a literal integer token.
 *
 * The final line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows into the macro body.
 */
#define UNROLL31(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15) \
    F(16) F(17) F(18) F(19) \
    F(20) F(21) F(22) F(23) \
    F(24) F(25) F(26) F(27) \
    F(28) F(29) F(30)
/*
 * Byte-table lookup for 32 indices using per-byte extract/insert.
 *
 * table  Byte table; table[i] is read for every index byte i in idx.
 * idx    32 unsigned 8-bit indices.
 *
 * Returns a vector whose byte k is table[byte k of idx].
 *
 * The vector is split into its two 128-bit halves; for each of the 16 byte
 * positions of a half, the index byte is extracted, the table entry is
 * loaded as a scalar, and the result is inserted at the same position.
 * The extract/insert intrinsics need the position as a compile-time
 * constant, hence the UNROLL16 expansion instead of a loop.
 */
static inline __m256i
mov32(unsigned char *table, __m256i idx)
{
    __m128i idx_lo = _mm256_extractf128_si256(idx, 0);
    __m128i idx_hi = _mm256_extractf128_si256(idx, 1);
    /* Every byte of lo/hi is overwritten below, so no initial value is needed. */
    __m128i lo = _mm_undefined_si128(), hi = _mm_undefined_si128();
    unsigned char c;

/* No backslash after the last line of each macro: a trailing continuation
 * would pull the following source line into the macro body. */
#define INDEX_LO(N) \
    c = table[_mm_extract_epi8(idx_lo, N)]; \
    lo = _mm_insert_epi8(lo, c, N);

#define INDEX_HI(N) \
    c = table[_mm_extract_epi8(idx_hi, N)]; \
    hi = _mm_insert_epi8(hi, c, N);

    UNROLL16(INDEX_LO);
    UNROLL16(INDEX_HI);

    /* Reassemble: lo becomes the low 128 bits, hi the high 128 bits. */
    __m256i ret = _mm256_castsi128_si256(lo);
    return _mm256_insertf128_si256(ret, hi, 1);
}
/*
 * Byte-table lookup for 32 indices done entirely with scalar loads.
 *
 * table  Byte table; table[i] is read for every index byte i in idx.
 * idx    32 unsigned 8-bit indices.
 *
 * The index vector is stored to a 32-byte aligned buffer, each of the 32
 * lookups is performed as a plain array access into a second aligned
 * buffer, and that buffer is loaded back as the returned vector.
 */
static inline __m256i
full_scalar(unsigned char *table, __m256i idx)
{
    __attribute__((aligned(32))) unsigned char lane_index[32];
    __attribute__((aligned(32))) unsigned char lane_value[32];

    _mm256_store_si256((__m256i *)lane_index, idx);

/* One lookup per byte position: lane_value[K] = table[lane_index[K]],
 * expanded for K = 0 .. 31 by UNROLL32. */
#define SCALAR(K) \
    lane_value[K] = table[lane_index[K]];

    UNROLL32(SCALAR);

    return _mm256_load_si256((__m256i *)lane_value);
}
/*
 * Time the three lookup implementations and print the average number of
 * time-stamp-counter ticks per call for each.
 *
 * idx  Index vector handed to every implementation.
 * val  Byte table handed to every implementation.
 *
 * Each implementation is called nloop times between two __rdtsc() reads.
 * The empty asm statement after every call lists the result and idx as
 * read-write operands, so the compiler has to materialise the result and
 * cannot assume idx is unchanged from one iteration to the next.
 */
static __attribute__((noinline)) void
test(__m256i idx, unsigned char *val)
{
    /* __rdtsc() returns a 64-bit count; keep all of it so the elapsed
     * value cannot wrap at 2^32 ticks. */
    unsigned long long t0, t1;
    const int nloop = 1024*1024*8;

    t0 = __rdtsc();
    for (int i = 0; i < nloop; i++) {
        __m256i r = mm256_u8gather_epu8(val, idx, _mm256_set1_epi32(0xFF));
        __asm__ __volatile__ (" "
                              :"+x"(r),"+x"(idx));
    }
    t1 = __rdtsc();
    printf("gather %f\n", (t1-t0) / (double)nloop);

    t0 = __rdtsc();
    for (int i = 0; i < nloop; i++) {
        __m256i r = mov32(val, idx);
        __asm__ __volatile__ (" "
                              :"+x"(r),"+x"(idx));
    }
    t1 = __rdtsc();
    printf("mov32 %f\n", (t1-t0) / (double)nloop);

    t0 = __rdtsc();
    for (int i = 0; i < nloop; i++) {
        __m256i r = full_scalar(val, idx);
        __asm__ __volatile__ (" "
                              :"+x"(r),"+x"(idx));
    }
    t1 = __rdtsc();
    printf("full scalar %f\n", (t1-t0) / (double)nloop);
}
/*
 * Benchmark driver.
 *
 * Usage: prog [seed]
 *   seed  Optional decimal seed for rand(); defaults to 0.
 *
 * Fills a 256-entry byte table and 32 byte indices with pseudo-random
 * values, then runs the benchmark twice on the same data.
 */
int
main(int argc, char **argv)
{
    unsigned char mem_idx[32];
    /*
     * 256 table entries plus 3 zeroed padding bytes: the gather-based
     * lookup loads a 32-bit word starting at each index, so index 255
     * touches bytes 255..258.
     */
    unsigned char mem_val[256 + 3] = {0};
    unsigned int seed = 0;

    if (argc > 1) {
        /* strtoul has defined behaviour on out-of-range input, unlike atoi. */
        seed = (unsigned int)strtoul(argv[1], NULL, 10);
    }
    srand(seed);

    for (int i = 0; i < 256; i++) {
        mem_val[i] = (unsigned char)rand();
    }
    for (int i = 0; i < 32; i++) {
        mem_idx[i] = (unsigned char)rand();
    }

    __m256i idx = _mm256_loadu_si256((const __m256i *)mem_idx);

    test(idx, mem_val);
    test(idx, mem_val);
    return 0;
}
/* Gist page footer: Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment. */