Created
October 7, 2016 18:37
-
-
Save tanakamura/7c159d27f744fc24ff8243522b166820 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <immintrin.h> | |
| #include <x86intrin.h> | |
| #include <stdlib.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
/*
 * Byte-granular table lookup built on AVX2 32-bit gathers.
 *
 * For each of the 32 unsigned 8-bit indices in vindex, fetches the table
 * entry at lut + index and returns the 32 results in the same byte positions.
 *
 * lut      Base of the byte table.  Every lookup is a 32-bit gather with a
 *          byte scale of 1, so the load for a given index touches
 *          lut[index] .. lut[index + 3].
 * vindex   32 unsigned byte indices.
 * andMask  ANDed with every gathered dword before the results are packed.
 *          The caller in this file passes 0xFF in each dword, which keeps
 *          only the byte loaded from lut[index] so the saturating packs
 *          below pass it through unchanged.
 *
 * Returns the 32 masked lookups narrowed back to bytes.
 */
static inline
__m256i mm256_u8gather_epu8(const uint8_t* lut, __m256i vindex, __m256i andMask) {
    const __m256i zero = _mm256_setzero_si256();
    const int *dwords = (const int *)lut;

    /*
     * Zero-extend the indices u8 -> u16.  The unpack instructions operate on
     * each 128-bit lane independently, and so do the packs at the end, which
     * is why the original byte order is restored without extra shuffles.
     */
    __m256i half_lo = _mm256_unpacklo_epi8(vindex, zero);
    __m256i half_hi = _mm256_unpackhi_epi8(vindex, zero);

    /* Zero-extend u16 -> u32 and gather one dword per index (scale = 1 byte). */
    __m256i q0 = _mm256_i32gather_epi32(dwords, _mm256_unpacklo_epi16(half_lo, zero), 1);
    __m256i q1 = _mm256_i32gather_epi32(dwords, _mm256_unpackhi_epi16(half_lo, zero), 1);
    __m256i q2 = _mm256_i32gather_epi32(dwords, _mm256_unpacklo_epi16(half_hi, zero), 1);
    __m256i q3 = _mm256_i32gather_epi32(dwords, _mm256_unpackhi_epi16(half_hi, zero), 1);

    /* Mask each dword, then narrow u32 -> u16 -> u8 with unsigned saturation. */
    __m256i packed_lo = _mm256_packus_epi32(_mm256_and_si256(q0, andMask),
                                            _mm256_and_si256(q1, andMask));
    __m256i packed_hi = _mm256_packus_epi32(_mm256_and_si256(q2, andMask),
                                            _mm256_and_si256(q3, andMask));

    return _mm256_packus_epi16(packed_lo, packed_hi);
}
/*
 * UNROLL16(F): expands to F(0) F(1) ... F(15), in ascending order.
 *
 * Nothing is inserted between the expansions, so F(n) must expand to complete
 * statements that carry their own terminators.
 *
 * The final body line deliberately has no trailing backslash: a dangling
 * line continuation would splice whatever line follows the macro into its
 * replacement list.
 */
#define UNROLL16(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15)
/*
 * UNROLL32(F): expands to F(0) F(1) ... F(31), in ascending order.
 *
 * Nothing is inserted between the expansions, so F(n) must expand to complete
 * statements that carry their own terminators.
 *
 * The final body line deliberately has no trailing backslash: a dangling
 * line continuation would splice whatever line follows the macro into its
 * replacement list.
 */
#define UNROLL32(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15) \
    F(16) F(17) F(18) F(19) \
    F(20) F(21) F(22) F(23) \
    F(24) F(25) F(26) F(27) \
    F(28) F(29) F(30) F(31)
/*
 * UNROLL31(F): expands to F(0) F(1) ... F(30), in ascending order
 * (one expansion fewer than UNROLL32).
 *
 * Nothing is inserted between the expansions, so F(n) must expand to complete
 * statements that carry their own terminators.
 *
 * The final body line deliberately has no trailing backslash: a dangling
 * line continuation would splice whatever line follows the macro into its
 * replacement list.
 */
#define UNROLL31(F) \
    F(0)  F(1)  F(2)  F(3)  \
    F(4)  F(5)  F(6)  F(7)  \
    F(8)  F(9)  F(10) F(11) \
    F(12) F(13) F(14) F(15) \
    F(16) F(17) F(18) F(19) \
    F(20) F(21) F(22) F(23) \
    F(24) F(25) F(26) F(27) \
    F(28) F(29) F(30)
| static inline __m256i | |
| mov32(unsigned char *table, __m256i idx) | |
| { | |
| __m128i idx_lo = _mm256_extractf128_si256(idx,0); | |
| __m128i idx_hi = _mm256_extractf128_si256(idx,1); | |
| __m128i lo = _mm_undefined_si128(), hi = _mm_undefined_si128(); | |
| unsigned char c; | |
| #define INDEX_LO(N) \ | |
| c = table[_mm_extract_epi8(idx_lo,N)]; \ | |
| lo = _mm_insert_epi8(lo, c, N); \ | |
| #define INDEX_HI(N) \ | |
| c = table[_mm_extract_epi8(idx_hi,N)]; \ | |
| hi = _mm_insert_epi8(hi, c, N); | |
| UNROLL16(INDEX_LO); | |
| UNROLL16(INDEX_HI); | |
| __m256i ret = _mm256_castsi128_si256(lo); | |
| return _mm256_insertf128_si256(ret, hi, 1); | |
| } | |
| static inline __m256i | |
| full_scalar(unsigned char *table, __m256i idx) | |
| { | |
| unsigned char idx_scalar[32] __attribute__((aligned(32))); | |
| unsigned char result[32] __attribute__((aligned(32))); | |
| _mm256_store_si256((__m256i*)idx_scalar, idx); | |
| /* result[0] = table[idx_scalar[0]]; | |
| * result[1] = table[idx_scalar[1]]; | |
| * ... | |
| * result[31] = table[idx_scalar[31]]; | |
| */ | |
| #define SCALAR(N)\ | |
| result[N] = table[idx_scalar[N]]; | |
| UNROLL32(SCALAR); | |
| return _mm256_load_si256((__m256i*)result); | |
| } | |
| static __attribute__((noinline)) void | |
| test(__m256i idx, unsigned char *val) | |
| { | |
| unsigned int t0,t1; | |
| int nloop = 1024*1024*8; | |
| t0 = __rdtsc(); | |
| for (int i=0; i<nloop; i++) { | |
| __m256i r = mm256_u8gather_epu8(val, idx, _mm256_set1_epi32(0xFF)); | |
| __asm__ __volatile__ (" " | |
| :"+x"(r),"+x"(idx)); | |
| } | |
| t1 = __rdtsc(); | |
| printf("gather %f\n", (t1-t0) / (double)nloop); | |
| t0 = __rdtsc(); | |
| for (int i=0; i<nloop; i++) { | |
| __m256i r = mov32(val, idx); | |
| __asm__ __volatile__ (" " | |
| :"+x"(r),"+x"(idx)); | |
| } | |
| t1 = __rdtsc(); | |
| printf("mov32 %f\n", (t1-t0) / (double)nloop); | |
| t0 = __rdtsc(); | |
| for (int i=0; i<nloop; i++) { | |
| __m256i r = full_scalar(val, idx); | |
| __asm__ __volatile__ (" " | |
| :"+x"(r),"+x"(idx)); | |
| } | |
| t1 = __rdtsc(); | |
| printf("full scalar %f\n", (t1-t0) / (double)nloop); | |
| } | |
/*
 * Benchmark driver: fills a 256-entry byte table and 32 byte indices with
 * pseudo-random data, then runs the timing routine twice.
 *
 * argv[1], if present, is parsed as a decimal number and used as the seed for
 * rand(); otherwise the seed is 0.
 */
int
main(int argc, char **argv)
{
    /*
     * The gather-based lookup reads 32 bits at table + index, so an index of
     * 255 touches table[255..258].  Pad the table by 3 zeroed bytes so those
     * reads stay inside the array.
     */
    enum { TABLE_SIZE = 256, GATHER_PAD = 3, NUM_INDICES = 32 };
    unsigned char mem_idx[NUM_INDICES];
    unsigned char mem_val[TABLE_SIZE + GATHER_PAD] = {0};
    unsigned int seed = 0;

    if (argc > 1) {
        /* strtoul has defined behaviour for out-of-range input, unlike atoi. */
        seed = (unsigned int)strtoul(argv[1], NULL, 10);
    }
    srand(seed);

    /* Table first, indices second: same rand() consumption order as before. */
    for (int i = 0; i < TABLE_SIZE; i++) {
        mem_val[i] = (unsigned char)rand();
    }
    for (int i = 0; i < NUM_INDICES; i++) {
        mem_idx[i] = (unsigned char)rand();
    }

    __m256i idx = _mm256_loadu_si256((const __m256i *)mem_idx);

    /* The measurement is simply run twice with identical inputs. */
    test(idx, mem_val);
    test(idx, mem_val);

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment