#include <immintrin.h>
#include <stdint.h>

// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTUCTIONS 0

// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage
// Which is fine here because our bytes are in the [ 0 .. 8 ] interval
inline __m128i inclusivePrefixSum( __m128i v )
{
    // https://en.wikipedia.org/wiki/Prefix_sum#/media/File:Hillis-Steele_Prefix_Sum.svg
    v = _mm_add_epi8( v, _mm_slli_si128( v, 1 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 2 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 4 ) );
    v = _mm_add_epi8( v, _mm_slli_si128( v, 8 ) );
    return v;
}

// Extract the last byte from the vector
inline uint8_t extractLast( __m128i v )
{
    uint16_t tmp = _mm_extract_epi16( v, 7 );
    tmp >>= 8;
    return (uint8_t)tmp;
}
// Load 4 uint64_t numbers from the correct locations, without AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, uint32_t offsets )
{
    const int64_t* s0 = (const int64_t*)( rsi + (uint8_t)offsets );
    const int64_t* s1 = (const int64_t*)( rsi + (uint8_t)( offsets >> 8 ) );
    const int64_t* s2 = (const int64_t*)( rsi + (uint8_t)( offsets >> 16 ) );
    const int64_t* s3 = (const int64_t*)( rsi + ( offsets >> 24 ) );
    return _mm256_setr_epi64x( *s0, *s1, *s2, *s3 );
}

// Load 4 uint64_t numbers from the correct locations, using AVX2 gathers
inline __m256i loadNumbers( const uint8_t* rsi, __m128i offsets )
{
    // Zero extend bytes to int32
    __m128i off = _mm_cvtepu8_epi32( offsets );
    // Load 4 numbers with 1 instruction; unfortunately, on AMD this is slower
    return _mm256_i32gather_epi64( (const int64_t*)rsi, off, 1 );
}

// Shift the highest ( 64 - bits[ i ] ) bits of the int64 numbers into the low position
inline __m256i shiftNumbers( __m256i v, __m128i bits )
{
    // Zero extend bytes to int64
    __m256i shift = _mm256_cvtepu8_epi64( bits );
    // Shift the numbers
    return _mm256_srlv_epi64( v, shift );
}

// Conditionally negate int64 numbers based on the 0x80 bit in the lowest 4 bytes of the second argument
inline __m256i applySigns( __m256i v, __m128i signs )
{
    // Sign extend the masks from bytes to int64
    __m256i mask = _mm256_cvtepi8_epi64( signs );
    // Conditionally negate
    __m256i neg = _mm256_sub_epi64( _mm256_setzero_si256(), v );
    return _mm256_blendv_epi8( v, neg, mask );
}

struct BlockHeader
{
    // Load offsets in bytes, relative to the start of the block header
    __m128i offsetBytes;
    // Right shift amounts to move loaded values to the correct positions, [ 0 .. 64 ]
    __m128i shifts;
    // 16 bytes with the 0x80 bit set when the corresponding input was negative; the rest of the bits are unused
    __m128i signs;
    // Count of payload bytes in the complete block
    size_t payloadBytes;
};
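
// Each block starts with an 8-byte header describing 16 numbers, two per header byte, as unpacked below:
// bits [ 0 .. 2 ] and [ 4 .. 6 ] are length codes in [ 0 .. 7 ], where code 7 means 8 payload bytes,
// and bits 3 and 7 are the sign flags of the two numbers. The variable-length payload bytes follow the header.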
inline BlockHeader loadHeader( const uint8_t* rsi )
{
    // Load 8 bytes, and zero extend them into uint16_t
    const __m128i v = _mm_cvtepu8_epi16( _mm_loadu_si64( rsi ) );
    // Unpack lengths
    const __m128i seven = _mm_set1_epi8( 7 );
    const __m128i l4 = _mm_slli_epi16( v, 4 );
    __m128i lengths = _mm_or_si128( v, l4 );
    lengths = _mm_and_si128( lengths, seven );
    // Transform 7 into 8
    __m128i tmp = _mm_cmpeq_epi8( lengths, seven );
    lengths = _mm_sub_epi8( lengths, tmp );
    BlockHeader header;
    // Byte offsets to load 16 numbers, relative to the start of the header
    header.offsetBytes = inclusivePrefixSum( lengths );
    // Count of payload bytes in the complete block
    header.payloadBytes = extractLast( header.offsetBytes );
    // Shift amounts, 64 - lengths * 8
    header.shifts = _mm_sub_epi8( _mm_set1_epi8( 64 ), _mm_slli_epi16( lengths, 3 ) );
    // Signs vector, we only use the highest 0x80 bit in these bytes
    header.signs = _mm_or_si128( _mm_slli_epi16( v, 8 ), l4 );
    return header;
}

template<int slice>
inline void decodeSlice( const BlockHeader& block, int64_t* rdi, const uint8_t* payload )
{
#if USE_GATHER_INSTUCTIONS
    __m128i off;
#else
    uint32_t off;
#endif
    __m128i bits, s;
    if constexpr( slice != 0 )
    {
        constexpr int imm = _MM_SHUFFLE( slice, slice, slice, slice );
#if USE_GATHER_INSTUCTIONS
        off = _mm_shuffle_epi32( block.offsetBytes, imm );
#else
        off = (uint32_t)_mm_extract_epi32( block.offsetBytes, slice );
#endif
        bits = _mm_shuffle_epi32( block.shifts, imm );
        s = _mm_shuffle_epi32( block.signs, imm );
    }
    else
    {
        // For the first slice of the block, the 4 lowest bytes are in the correct locations already
#if USE_GATHER_INSTUCTIONS
        off = block.offsetBytes;
#else
        off = (uint32_t)_mm_cvtsi128_si32( block.offsetBytes );
#endif
        bits = block.shifts;
        s = block.signs;
    }
    __m256i v = loadNumbers( payload, off );
    v = shiftNumbers( v, bits );
    v = applySigns( v, s );
    _mm256_storeu_si256( ( __m256i* )rdi, v );
}

// Decode and store a block of 16 numbers, and return pointer to the next encoded block.
// BTW, it helps to make sure this function is inlined by the compiler
const uint8_t* decodeBlock( int64_t* rdi, const uint8_t* rsi )
{
    const BlockHeader block = loadHeader( rsi );
    decodeSlice<0>( block, rdi, rsi );
    decodeSlice<1>( block, rdi + 4, rsi );
    decodeSlice<2>( block, rdi + 8, rsi );
    decodeSlice<3>( block, rdi + 12, rsi );
    return rsi + block.payloadBytes + 8;
}
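
For reference, decoding a longer stream is just a loop over `decodeBlock`. The sketch below is a minimal, hypothetical driver (the `decodeStream` helper is not part of the code above); it assumes `count` is a multiple of 16 and that `rsi` points at the matching number of complete encoded blocks, and it leaves out handling of a partial final block.

// Minimal usage sketch: decode `count` numbers, 16 per encoded block
// Assumes `count` is a multiple of 16, and the source buffer contains that many complete blocks
inline const uint8_t* decodeStream( int64_t* rdi, const uint8_t* rsi, size_t count )
{
    for( size_t i = 0; i < count; i += 16 )
        rsi = decodeBlock( rdi + i, rsi );
    // Pointer just past the last consumed block
    return rsi;
}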