Const-me · March 11, 2023 01:26
diff --git a/QuantisationTest.cpp b/QuantisationTest.cpp
 // ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
 #include <array>
 #include <immintrin.h>

 // Expand 16 packed bytes into 32 separate bytes, one per 4-bit field.
 // rsi must point to at least 16 readable bytes; no alignment is required.
 // Output byte 2*k holds the low nibble of input byte k, output byte 2*k+1 holds its high nibble,
 // so every output byte is in the [ 0 .. 15 ] interval.
 inline __m256i bytesFromNibbles( const uint8_t* rsi )
 {
 	// Unaligned 16-byte load; every source byte carries two 4-bit fields
 	const __m128i packed = _mm_loadu_si128( reinterpret_cast<const __m128i*>( rsi ) );

 	// Zero-extend each byte into a 16-bit lane, which leaves the upper byte of every lane clear
 	const __m256i widened = _mm256_cvtepu8_epi16( packed );

 	const __m256i nibbleMask = _mm256_set1_epi8( 0xF );

 	// The low nibble is already where it belongs, in the lower byte of the lane
 	const __m256i lowNibbles = _mm256_and_si256( nibbleMask, widened );

 	// The high nibble sits in bits 4..7; shifting the 16-bit lane left by 4 moves it to bits 8..11,
 	// which is the low nibble of the lane's upper byte
 	const __m256i highNibbles = _mm256_slli_epi16( _mm256_andnot_si256( nibbleMask, widened ), 4 );

 	return _mm256_or_si256( lowNibbles, highNibbles );
 }

 // Convert the lowest 8 bytes of the vector, interpreted as int8_t, into 8 float lanes.
 // The upper 8 bytes of the argument are ignored.
 inline __m256 makeFloats( __m128i bytes )
 {
 	// Sign-extend int8 -> int32 first, then convert int32 -> float
 	return _mm256_cvtepi32_ps( _mm256_cvtepi8_epi32( bytes ) );
 }

 // Decompress Q4_0 compressed block, the block size is 32
 // The block payload contains 1 reference value (the first argument), and 32 4-bit values packed into 16 bytes (second argument)
 std::array<__m256, 4> decompressBlock40( const float* scaling, const uint8_t* rsi )
 {
 	// Unpack 4-bit fields into bytes
 	__m256i bytes = bytesFromNibbles( rsi );
 	// Now we have a vector with bytes in [0..15], offset into [-8..+7]
 	const __m256i off = _mm256_set1_epi8( 8 );
 	bytes = _mm256_sub_epi8( bytes, off );

 	// Broadcast ref1 into AVX vector
 	const __m256 sv = _mm256_broadcast_ss( scaling );

 	// Produce the result
 	std::array<__m256, 4> arr;

 	__m128i tmp = _mm256_castsi256_si128( bytes );
 	arr[ 0 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
 	tmp = _mm_srli_si128( tmp, 8 );
 	arr[ 1 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );

 	tmp = _mm256_extracti128_si256( bytes, 1 );
 	arr[ 2 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
 	tmp = _mm_srli_si128( tmp, 8 );
 	arr[ 3 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
 	return arr;
 }

 // Decompress Q4_1 compressed block, the block size is 32
 // The block payload contains min value, scaling vactor, and 32 4-bit values packed into 16 bytes
 std::array<__m256, 4> decompressBlock41( const float* minValue, const float* scaling, const uint8_t* rsi )
 {
 	// Unpack 4-bit fields into bytes
 	const __m256i bytes = bytesFromNibbles( rsi );

 	// Broadcast both floats into AVX vectors
 	const __m256 iv = _mm256_broadcast_ss( minValue );
 	const __m256 sv = _mm256_broadcast_ss( scaling );

 	// Produce the result
 	std::array<__m256, 4> arr;

 	__m128i tmp = _mm256_castsi256_si128( bytes );
 	arr[ 0 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
 	tmp = _mm_srli_si128( tmp, 8 );
 	arr[ 1 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );

 	tmp = _mm256_extracti128_si256( bytes, 1 );
 	arr[ 2 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
 	tmp = _mm_srli_si128( tmp, 8 );
 	arr[ 3 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
 	return arr;
 }

 // ==== Debug Functions ====
 #include <assert.h>
 #include <cmath>
 #include <stdio.h>

 // Write the 4 vectors of a decompressed block into 32 consecutive floats of arr,
 // vector k going to elements [ 8*k .. 8*k+7 ]. Unaligned stores are used.
 inline void storeBlock( std::array<float, 32>& arr, std::array<__m256, 4> v )
 {
 	float* dest = arr.data();
 	for( const __m256& vec : v )
 	{
 		_mm256_storeu_ps( dest, vec );
 		// Each vector holds 8 floats
 		dest += 8;
 	}
 }

 // Scalar reference for Q4_0: maps one 4-bit field (passed in a byte, must be <= 15)
 // to scaling * ( byte - 8 ).
 float decompressScalar40( float scaling, uint8_t byte )
 {
 	assert( byte <= 15 );
 	// Re-center the field from [ 0 .. 15 ] to [ -8 .. +7 ]
 	const int8_t centered = static_cast<int8_t>( static_cast<int8_t>( byte ) - 8 );
 	return scaling * static_cast<float>( centered );
 }

 // Scalar reference for Q4_1: maps one 4-bit field (passed in a byte, must be <= 15)
 // to scaling * byte + minValue, using a fused multiply-add.
 float decompressScalar41( float minValue, float scaling, uint8_t byte )
 {
 	assert( byte <= 15 );
 	const float quantized = static_cast<float>( byte );
 	return std::fma( scaling, quantized, minValue );
 }

 // Self-test: decompresses one fixed 16-byte payload with both AVX2 decompressors and compares
 // every element against the scalar reference implementations.
 // Returns 0 and prints "Success!" when all 32 elements match in both formats.
 // Returns 1 on the first mismatch, after reporting the format, element index and both values on stderr.
 int main()
 {
 	const float scaling = 13;
 	const float min = 44;
 	// From random.org
 	const std::array<uint8_t, 16> bytes = { 188, 56, 77, 68, 113, 245, 126, 231, 143, 225, 48, 216, 191, 53, 110, 118 };

 	// Decompress and store these bytes in both compressed formats
 	std::array<float, 32> b40, b41;
 	storeBlock( b40, decompressBlock40( &scaling, bytes.data() ) );
 	storeBlock( b41, decompressBlock41( &min, &scaling, bytes.data() ) );

 	// Verify the data
 	for( size_t i = 0; i < 32; i++ )
 	{
 		// Even elements come from the low nibble of a payload byte, odd elements from the high nibble
 		const uint8_t packed = bytes[ i / 2 ];
 		const uint8_t field = ( 0 == ( i % 2 ) ) ? static_cast<uint8_t>( packed & 0xF ) : static_cast<uint8_t>( packed >> 4 );

 		// Exact comparison is intended: with these constants every expected value is a small integer,
 		// so the vector and scalar paths must agree bit for bit.

 		// Verify Q4_0 decompressor
 		const float scalar40 = decompressScalar40( scaling, field );
 		if( b40[ i ] != scalar40 )
 		{
 			fprintf( stderr, "Q4_0 mismatch at element %zu: AVX2 produced %g, scalar reference %g\n", i, b40[ i ], scalar40 );
 			return 1;
 		}

 		// Verify Q4_1 decompressor
 		const float scalar41 = decompressScalar41( min, scaling, field );
 		if( b41[ i ] != scalar41 )
 		{
 			fprintf( stderr, "Q4_1 mismatch at element %zu: AVX2 produced %g, scalar reference %g\n", i, b41[ i ], scalar41 );
 			return 1;
 		}
 	}

 	printf( "Success!\n" );
 	return 0;
 }
	// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
	#include <array>
	#include <immintrin.h>

	// Expand 16 packed bytes into 32 separate bytes, one per 4-bit field.
	// rsi must point to at least 16 readable bytes; no alignment is required.
	// Output byte 2*k holds the low nibble of input byte k, output byte 2*k+1 holds its high nibble,
	// so every output byte is in the [ 0 .. 15 ] interval.
	inline __m256i bytesFromNibbles( const uint8_t* rsi )
	{
		// Unaligned 16-byte load; every source byte carries two 4-bit fields
		const __m128i packed = _mm_loadu_si128( reinterpret_cast<const __m128i*>( rsi ) );

		// Zero-extend each byte into a 16-bit lane, which leaves the upper byte of every lane clear
		const __m256i widened = _mm256_cvtepu8_epi16( packed );

		const __m256i nibbleMask = _mm256_set1_epi8( 0xF );

		// The low nibble is already where it belongs, in the lower byte of the lane
		const __m256i lowNibbles = _mm256_and_si256( nibbleMask, widened );

		// The high nibble sits in bits 4..7; shifting the 16-bit lane left by 4 moves it to bits 8..11,
		// which is the low nibble of the lane's upper byte
		const __m256i highNibbles = _mm256_slli_epi16( _mm256_andnot_si256( nibbleMask, widened ), 4 );

		return _mm256_or_si256( lowNibbles, highNibbles );
	}

	// Convert the lowest 8 bytes of the vector, interpreted as int8_t, into 8 float lanes.
	// The upper 8 bytes of the argument are ignored.
	inline __m256 makeFloats( __m128i bytes )
	{
		// Sign-extend int8 -> int32 first, then convert int32 -> float
		return _mm256_cvtepi32_ps( _mm256_cvtepi8_epi32( bytes ) );
	}

	// Decompress Q4_0 compressed block, the block size is 32
	// The block payload contains 1 reference value (the first argument), and 32 4-bit values packed into 16 bytes (second argument)
	std::array<__m256, 4> decompressBlock40( const float* scaling, const uint8_t* rsi )
	{
	// Unpack 4-bit fields into bytes
	__m256i bytes = bytesFromNibbles( rsi );
	// Now we have a vector with bytes in [0..15], offset into [-8..+7]
	const __m256i off = _mm256_set1_epi8( 8 );
	bytes = _mm256_sub_epi8( bytes, off );

	// Broadcast ref1 into AVX vector
	const __m256 sv = _mm256_broadcast_ss( scaling );

	// Produce the result
	std::array<__m256, 4> arr;

	__m128i tmp = _mm256_castsi256_si128( bytes );
	arr[ 0 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
	tmp = _mm_srli_si128( tmp, 8 );
	arr[ 1 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );

	tmp = _mm256_extracti128_si256( bytes, 1 );
	arr[ 2 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
	tmp = _mm_srli_si128( tmp, 8 );
	arr[ 3 ] = _mm256_mul_ps( sv, makeFloats( tmp ) );
	return arr;
	}

	// Decompress Q4_1 compressed block, the block size is 32
	// The block payload contains min value, scaling vactor, and 32 4-bit values packed into 16 bytes
	std::array<__m256, 4> decompressBlock41( const float* minValue, const float* scaling, const uint8_t* rsi )
	{
	// Unpack 4-bit fields into bytes
	const __m256i bytes = bytesFromNibbles( rsi );

	// Broadcast both floats into AVX vectors
	const __m256 iv = _mm256_broadcast_ss( minValue );
	const __m256 sv = _mm256_broadcast_ss( scaling );

	// Produce the result
	std::array<__m256, 4> arr;

	__m128i tmp = _mm256_castsi256_si128( bytes );
	arr[ 0 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
	tmp = _mm_srli_si128( tmp, 8 );
	arr[ 1 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );

	tmp = _mm256_extracti128_si256( bytes, 1 );
	arr[ 2 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
	tmp = _mm_srli_si128( tmp, 8 );
	arr[ 3 ] = _mm256_fmadd_ps( sv, makeFloats( tmp ), iv );
	return arr;
	}

	// ==== Debug Functions ====
	#include <assert.h>
	#include <cmath>
	#include <stdio.h>

	// Write the 4 vectors of a decompressed block into 32 consecutive floats of arr,
	// vector k going to elements [ 8*k .. 8*k+7 ]. Unaligned stores are used.
	inline void storeBlock( std::array<float, 32>& arr, std::array<__m256, 4> v )
	{
		float* dest = arr.data();
		for( const __m256& vec : v )
		{
			_mm256_storeu_ps( dest, vec );
			// Each vector holds 8 floats
			dest += 8;
		}
	}

	// Scalar reference for Q4_0: maps one 4-bit field (passed in a byte, must be <= 15)
	// to scaling * ( byte - 8 ).
	float decompressScalar40( float scaling, uint8_t byte )
	{
		assert( byte <= 15 );
		// Re-center the field from [ 0 .. 15 ] to [ -8 .. +7 ]
		const int8_t centered = static_cast<int8_t>( static_cast<int8_t>( byte ) - 8 );
		return scaling * static_cast<float>( centered );
	}

	// Scalar reference for Q4_1: maps one 4-bit field (passed in a byte, must be <= 15)
	// to scaling * byte + minValue, using a fused multiply-add.
	float decompressScalar41( float minValue, float scaling, uint8_t byte )
	{
		assert( byte <= 15 );
		const float quantized = static_cast<float>( byte );
		return std::fma( scaling, quantized, minValue );
	}

	// Self-test: decompresses one fixed 16-byte payload with both AVX2 decompressors and compares
	// every element against the scalar reference implementations.
	// Returns 0 and prints "Success!" when all 32 elements match in both formats.
	// Returns 1 on the first mismatch, after reporting the format, element index and both values on stderr.
	int main()
	{
		const float scaling = 13;
		const float min = 44;
		// From random.org
		const std::array<uint8_t, 16> bytes = { 188, 56, 77, 68, 113, 245, 126, 231, 143, 225, 48, 216, 191, 53, 110, 118 };

		// Decompress and store these bytes in both compressed formats
		std::array<float, 32> b40, b41;
		storeBlock( b40, decompressBlock40( &scaling, bytes.data() ) );
		storeBlock( b41, decompressBlock41( &min, &scaling, bytes.data() ) );

		// Verify the data
		for( size_t i = 0; i < 32; i++ )
		{
			// Even elements come from the low nibble of a payload byte, odd elements from the high nibble
			const uint8_t packed = bytes[ i / 2 ];
			const uint8_t field = ( 0 == ( i % 2 ) ) ? static_cast<uint8_t>( packed & 0xF ) : static_cast<uint8_t>( packed >> 4 );

			// Exact comparison is intended: with these constants every expected value is a small integer,
			// so the vector and scalar paths must agree bit for bit.

			// Verify Q4_0 decompressor
			const float scalar40 = decompressScalar40( scaling, field );
			if( b40[ i ] != scalar40 )
			{
				fprintf( stderr, "Q4_0 mismatch at element %zu: AVX2 produced %g, scalar reference %g\n", i, b40[ i ], scalar40 );
				return 1;
			}

			// Verify Q4_1 decompressor
			const float scalar41 = decompressScalar41( min, scaling, field );
			if( b41[ i ] != scalar41 )
			{
				fprintf( stderr, "Q4_1 mismatch at element %zu: AVX2 produced %g, scalar reference %g\n", i, b41[ i ], scalar41 );
				return 1;
			}
		}

		printf( "Success!\n" );
		return 0;
	}