#include <immintrin.h>
#include <stdint.h>

// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTUCTIONS 0

// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage
// Which is fine here because our bytes are in the [ 0 .. 8 ] interval
inline __m128i inclusivePrefixSum( __m128i v )
{
    // https://en.wikipedia.org/wiki/Prefix_sum#/media/File:Hillis-Steele_Prefix_Sum.svg
    v = _mm_add_epi8( v, _mm_slli_si128( v, 1 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 2 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 4 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 8 ) );
    return v;
}

// Extract the last byte from the vector
inline uint8_t extractLast( __m128i v )
{
    uint16_t tmp = _mm_extract_epi16( v, 7 );
    tmp >>= 8;
    return (uint8_t)tmp;
}
// Load 4 uint64_t numbers from the correct locations, without AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, uint32_t offsets )
{
    const int64_t* s0 = (const int64_t*)( rsi + (uint8_t)offsets );
    const int64_t* s1 = (const int64_t*)( rsi + (uint8_t)( offsets >> 8 ) );
    const int64_t* s2 = (const int64_t*)( rsi + (uint8_t)( offsets >> 16 ) );
    const int64_t* s3 = (const int64_t*)( rsi + ( offsets >> 24 ) );
    return _mm256_setr_epi64x( *s0, *s1, *s2, *s3 );
}

// Load 4 uint64_t numbers from the correct locations, using AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, __m128i offsets )
{
    // Zero extend bytes to int32
    __m128i off = _mm_cvtepu8_epi32( offsets );
    // Load 4 numbers with 1 instruction; unfortunately, on AMD this is slower
    return _mm256_i32gather_epi64( (const int64_t*)rsi, off, 1 );
}

// Shift the highest ( 64 - bits[ i ] ) bits of the int64 numbers into the low position
inline __m256i shiftNumbers( __m256i v, __m128i bits )
{
    // Zero extend bytes to int64
    __m256i shift = _mm256_cvtepu8_epi64( bits );
    // Shift the numbers
    return _mm256_srlv_epi64( v, shift );
}

// Conditionally negate int64 numbers based on the 0x80 bit in the lowest 4 bytes of the second argument
inline __m256i applySigns( __m256i v, __m128i signs )
{
    // Sign extend the masks from bytes to int64
    __m256i mask = _mm256_cvtepi8_epi64( signs );
    // Conditionally negate
    __m256i neg = _mm256_sub_epi64( _mm256_setzero_si256(), v );
    return _mm256_blendv_epi8( v, neg, mask );
}

struct BlockHeader
{
    // Load offsets in bytes, relative to the start of the block header
    __m128i offsetBytes;
    // Right shift amounts to move loaded values to the correct positions, [ 0 .. 64 ]
    __m128i shifts;
    // 16 bytes with the 0x80 bit set when the corresponding input was negative; the rest of the bits are unused
    __m128i signs;
    // Count of payload bytes in the complete block
    size_t payloadBytes;
};
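
// Each block starts with an 8-byte header describing 16 numbers, two per header byte, as unpacked below:
// bits [ 0 .. 2 ] and [ 4 .. 6 ] are length codes in [ 0 .. 7 ], where code 7 means 8 payload bytes,
// and bits 3 and 7 are the sign flags of the two numbers. The variable-length payload bytes follow the header.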
inline BlockHeader loadHeader( const uint8_t* rsi )
{
    // Load 8 bytes, and zero extend them into uint16_t
    const __m128i v = _mm_cvtepu8_epi16( _mm_loadu_si64( rsi ) );
    // Unpack lengths
    const __m128i seven = _mm_set1_epi8( 7 );
    const __m128i l4 = _mm_slli_epi16( v, 4 );
    __m128i lengths = _mm_or_si128( v, l4 );
    lengths = _mm_and_si128( lengths, seven );
    // Transform 7 into 8
    __m128i tmp = _mm_cmpeq_epi8( lengths, seven );
    lengths = _mm_sub_epi8( lengths, tmp );
    BlockHeader header;
    // Byte offsets to load 16 numbers, relative to the start of the header
    header.offsetBytes = inclusivePrefixSum( lengths );
    // Count of payload bytes in the complete block
    header.payloadBytes = extractLast( header.offsetBytes );
    // Shift amounts, 64 - lengths * 8
    header.shifts = _mm_sub_epi8( _mm_set1_epi8( 64 ), _mm_slli_epi16( lengths, 3 ) );
    // Signs vector, we only use the highest 0x80 bit in these bytes
    header.signs = _mm_or_si128( _mm_slli_epi16( v, 8 ), l4 );
    return header;
}

template<int slice>
inline void decodeSlice( const BlockHeader& block, int64_t* rdi, const uint8_t* payload )
{
#if USE_GATHER_INSTUCTIONS
    __m128i off;
#else
    uint32_t off;
#endif
    __m128i bits, s;
    if constexpr( slice != 0 )
    {
        constexpr int imm = _MM_SHUFFLE( slice, slice, slice, slice );
#if USE_GATHER_INSTUCTIONS
        off = _mm_shuffle_epi32( block.offsetBytes, imm );
#else
        off = (uint32_t)_mm_extract_epi32( block.offsetBytes, slice );
#endif
        bits = _mm_shuffle_epi32( block.shifts, imm );
        s = _mm_shuffle_epi32( block.signs, imm );
    }
    else
    {
        // For the first slice of the block, the 4 lowest bytes are in the correct locations already
#if USE_GATHER_INSTUCTIONS
        off = block.offsetBytes;
#else
        off = (uint32_t)_mm_cvtsi128_si32( block.offsetBytes );
#endif
        bits = block.shifts;
        s = block.signs;
    }
    __m256i v = loadNumbers( payload, off );
    v = shiftNumbers( v, bits );
    v = applySigns( v, s );
    _mm256_storeu_si256( ( __m256i* )rdi, v );
}

// Decode and store a block of 16 numbers, and return pointer to the next encoded block.
// BTW, it helps to make sure this function is inlined by the compiler
const uint8_t* decodeBlock( int64_t* rdi, const uint8_t* rsi )
{
    const BlockHeader block = loadHeader( rsi );
    decodeSlice<0>( block, rdi, rsi );
    decodeSlice<1>( block, rdi + 4, rsi );
    decodeSlice<2>( block, rdi + 8, rsi );
    decodeSlice<3>( block, rdi + 12, rsi );
    return rsi + block.payloadBytes + 8;
}
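
For reference, decoding a longer stream is just a loop over `decodeBlock`. The sketch below is a minimal, hypothetical driver (the `decodeStream` helper is not part of the code above); it assumes `count` is a multiple of 16 and that `rsi` points at the matching number of complete encoded blocks, and it leaves out handling of a partial final block.

// Minimal usage sketch: decode `count` numbers, 16 per encoded block
// Assumes `count` is a multiple of 16, and the source buffer contains that many complete blocks
inline const uint8_t* decodeStream( int64_t* rdi, const uint8_t* rsi, size_t count )
{
    for( size_t i = 0; i < count; i += 16 )
        rsi = decodeBlock( rdi + i, rsi );
    // Pointer just past the last consumed block
    return rsi;
}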