Created
March 11, 2023 01:26
-
-
Save Const-me/a0529a8c9885d371138a1c50e0622040 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <cstddef>
#include <cstdint>
#include <immintrin.h>
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | |
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ | |
// Load 16 bytes from memory | |
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); | |
// Expand bytes into uint16_t values | |
__m256i bytes = _mm256_cvtepu8_epi16( tmp ); | |
// Unpack values into individual bytes | |
const __m256i lowMask = _mm256_set1_epi8( 0xF ); | |
__m256i high = _mm256_andnot_si256( lowMask, bytes ); | |
__m256i low = _mm256_and_si256( lowMask, bytes ); | |
high = _mm256_slli_epi16( high, 4 ); | |
bytes = _mm256_or_si256( low, high ); | |
return bytes; | |
} | |
// Convert lower 8 lower bytes in the vector from int8_t into float lanes | |
inline __m256 makeFloats( __m128i bytes ) | |
{ | |
__m256i i32 = _mm256_cvtepi8_epi32( bytes ); | |
return _mm256_cvtepi32_ps( i32 ); | |
} | |
// Decompress Q4_0 compressed block, the block size is 32 | |
// The block payload contains 1 reference value (the first argument), and 32 4-bit values packed into 16 bytes (second argument) | |
std::array<__m256, 4> decompressBlock40( const float* scaling, const uint8_t* rsi ) | |
{ | |
// Unpack 4-bit fields into bytes | |
__m256i bytes = bytesFromNibbles( rsi ); | |
// Now we have a vector with bytes in [0..15], offset into [-8..+7] | |
const __m256i off = _mm256_set1_epi8( 8 ); | |
bytes = _mm256_sub_epi8( bytes, off ); | |
// Broadcast ref1 into AVX vector | |
const __m256 sv = _mm256_broadcast_ss( scaling ); | |
// Produce the result | |
std::array<__m256, 4> arr; | |
__m128i tmp = _mm256_castsi256_si128( bytes ); | |
arr[ 0 ] = _mm256_mul_ps( sv, makeFloats( tmp ) ); | |
tmp = _mm_srli_si128( tmp, 8 ); | |
arr[ 1 ] = _mm256_mul_ps( sv, makeFloats( tmp ) ); | |
tmp = _mm256_extracti128_si256( bytes, 1 ); | |
arr[ 2 ] = _mm256_mul_ps( sv, makeFloats( tmp ) ); | |
tmp = _mm_srli_si128( tmp, 8 ); | |
arr[ 3 ] = _mm256_mul_ps( sv, makeFloats( tmp ) ); | |
return arr; | |
} | |
// Decompress Q4_1 compressed block, the block size is 32 | |
// The block payload contains min value, scaling vactor, and 32 4-bit values packed into 16 bytes | |
std::array<__m256, 4> decompressBlock41( const float* minValue, const float* scaling, const uint8_t* rsi ) | |
{ | |
// Unpack 4-bit fields into bytes | |
const __m256i bytes = bytesFromNibbles( rsi ); | |
// Broadcast both floats into AVX vectors | |
const __m256 iv = _mm256_broadcast_ss( minValue ); | |
const __m256 sv = _mm256_broadcast_ss( scaling ); | |
// Produce the result | |
std::array<__m256, 4> arr; | |
__m128i tmp = _mm256_castsi256_si128( bytes ); | |
arr[ 0 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv ); | |
tmp = _mm_srli_si128( tmp, 8 ); | |
arr[ 1 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv ); | |
tmp = _mm256_extracti128_si256( bytes, 1 ); | |
arr[ 2 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv ); | |
tmp = _mm_srli_si128( tmp, 8 ); | |
arr[ 3 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv ); | |
return arr; | |
} | |
// ==== Debug Functions ====
#include <assert.h> | |
#include <cmath> | |
#include <stdio.h> | |
inline void storeBlock( std::array<float, 32>& arr, std::array<__m256, 4> v ) | |
{ | |
float* rdi = arr.data(); | |
_mm256_storeu_ps( rdi, v[ 0 ] ); | |
_mm256_storeu_ps( rdi + 8, v[ 1 ] ); | |
_mm256_storeu_ps( rdi + 16, v[ 2 ] ); | |
_mm256_storeu_ps( rdi + 24, v[ 3 ] ); | |
} | |
// Scalar reference implementation of the Q4_0 decoder, for a single 4-bit value.
// byte must be in [ 0 .. 15 ] ( checked with assert ); the result is scaling * ( byte - 8 ).
float decompressScalar40( float scaling, uint8_t byte )
{
	assert( byte <= 15 );
	// Offset [ 0 .. 15 ] into [ -8 .. +7 ] in signed arithmetic
	const int val = static_cast<int>( byte ) - 8;
	return scaling * static_cast<float>( val );
}
// Scalar reference implementation of the Q4_1 decoder, for a single 4-bit value.
// byte must be in [ 0 .. 15 ] ( checked with assert ); the result is scaling * byte + minValue.
// std::fma is used so the result is computed as a single fused multiply-add.
float decompressScalar41( float minValue, float scaling, uint8_t byte )
{
	assert( byte <= 15 );
	return std::fma( scaling, static_cast<float>( byte ), minValue );
}
int main() | |
{ | |
const float scaling = 13; | |
const float min = 44; | |
// From random.org | |
const std::array<uint8_t, 16> bytes = { 188, 56, 77, 68, 113, 245, 126, 231, 143, 225, 48, 216, 191, 53, 110, 118 }; | |
// Decompress and store these bytes in both compressed formats | |
std::array<float, 32> b40, b41; | |
storeBlock( b40, decompressBlock40( &scaling, bytes.data() ) ); | |
storeBlock( b41, decompressBlock41( &min, &scaling, bytes.data() ) ); | |
// Verify the data | |
for( size_t i = 0; i < 32; i++ ) | |
{ | |
uint8_t byte = bytes[ i / 2 ]; | |
if( 0 == ( i % 2 ) ) | |
byte &= 0xF; | |
else | |
byte = byte >> 4; | |
// Verify Q4_0 decompressor | |
float fast = b40[ i ]; | |
float scalar = decompressScalar40( scaling, byte ); | |
if( fast != scalar ) | |
return 1; | |
// Verify Q4_1 decompressor | |
fast = b41[ i ]; | |
scalar = decompressScalar41( min, scaling, byte ); | |
if( fast != scalar ) | |
return 1; | |
} | |
printf( "Success!\n" ); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment