Last active
June 30, 2023 16:22
-
-
Save Const-me/90a52f291c1fcb06142307facdb8e54e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Transform 4 inputs with 4 lookup tables, making 4 outputs.
//
// i4 packs the 4 inputs, one per byte: byte #j, counting from the least significant one, is the index into table #j.
// Each index is expected to be in [ 0 .. 15 ] interval.
// An index of 16 or more results in a zero output byte, because _mm256_srlv_epi64 produces 0 for shift counts above 63.
//
// tables4 holds the 4 tables in a single AVX2 vector, table #j is in the 64-bit lane #j.
// Each table is 16 entries of 4 bits each; entry #k occupies the bits [ 4*k .. 4*k+3 ] of the lane.
//
// Returns the 4 outputs with the same packing as the inputs: byte #j is the 4-bit entry selected from table #j.
// The function uses AVX2 intrinsics, so AVX2 must be enabled for the compiler and supported by the CPU.
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
	// Move 4 bytes into SSE vector
	__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
	// Expand bytes into uint64_t lanes, input byte #j goes to lane #j
	__m256i v = _mm256_cvtepu8_epi64( bytes );
	// Multiply them by 4 to get shift amounts in bits, each table entry is 4 bits wide
	v = _mm256_slli_epi64( v, 2 );
	// Shift numbers in the 4 tables, the selected entries land in the lowest 4 bits of the lanes
	v = _mm256_srlv_epi64( tables4, v );
	// Move bytes into the correct positions, within 16-byte pieces
	// _mm256_shuffle_epi8 can't move bytes across 16-byte pieces:
	// the lower piece delivers output bytes 0 and 1, the higher piece delivers output bytes 2 and 3
	// The -1 indices produce zeros, that's why bitwise OR is enough to merge the pieces afterwards
	const __m256i perm = _mm256_setr_epi8(
		0, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
		-1, -1, 0, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 );
	v = _mm256_shuffle_epi8( v, perm );
	// Merge 16-byte pieces into a single vector
	__m128i res = _mm256_extracti128_si256( v, 1 );
	res = _mm_or_si128( res, _mm256_castsi256_si128( v ) );
	// Move result into a scalar register
	uint32_t scalar = (uint32_t)_mm_cvtsi128_si32( res );
	// Mask away higher 4 bits in each byte
	// They are artifacts from the lookup tables: after the shift, these bits contain the next entry of the table
	scalar &= 0x0F0F0F0Fu;
	// Return the 4 outputs, same packing as the inputs
	return scalar;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment