Created
March 15, 2024 13:32
-
-
Save Const-me/dfee9f414982262df5e959713f5f63ac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h>
#include <emmintrin.h> // SSE 2
#include <tmmintrin.h> // SSSE 3
#include <smmintrin.h> // SSE 4.1
// Vector constants for dot4Sse function; makeConstantsSse builds an instance
struct ConstantVectorsSse
{
	// The 4 constant bytes packed into 32 bits (first constant in the lowest byte), repeated in all 4 lanes
	__m128i abcd;
	// 0x0F in every byte; AND-ing with it keeps only the low 4 bits of each byte
	__m128i lowNibbleMask;
	// All bits zero
	__m128i zero;
};
// Pack 4 bytes into a single uint32_t value.
// `a` lands in bits 0-7, `b` in bits 8-15, `c` in bits 16-23 and `d` in bits 24-31.
// Only the low 8 bits of each argument are used, so an out-of-range argument
// cannot spill into a neighbouring byte of the result.
uint32_t packBytes( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
{
	a &= 0xFFu;
	b = ( b & 0xFFu ) << 8;
	c = ( c & 0xFFu ) << 16;
	d = ( d & 0xFFu ) << 24;
	return a | b | c | d;
}
// Initialize vector constants for dot4Sse function | |
struct ConstantVectorsSse makeConstantsSse( uint8_t a, uint8_t b, uint8_t c, uint8_t d ) | |
{ | |
struct ConstantVectorsSse cv; | |
cv.abcd = _mm_set1_epi32( (int)packBytes( a, b, c, d ) ); | |
cv.lowNibbleMask = _mm_set1_epi8( 0x0F ); | |
cv.zero = _mm_setzero_si128(); | |
return cv; | |
} | |
// Dot products of 4 groups of 4 bytes in memory against 4 small constants | |
// Returns a vector of 4 int32 lanes | |
__m128i dot4Sse( const uint8_t* rsi, const struct ConstantVectorsSse* cv ) | |
{ | |
// Load 16 bytes, and mask away higher 4 bits in each byte | |
__m128i v = _mm_loadu_si128( ( const __m128i* )rsi ); | |
v = _mm_and_si128( cv->lowNibbleMask, v ); | |
// Compute products, add pairwise | |
v = _mm_maddubs_epi16( cv->abcd, v ); | |
// Final reduction step, add adjacent pairs of uint16_t lanes | |
__m128i high = _mm_srli_epi32( v, 16 ); | |
__m128i low = _mm_blend_epi16( v, cv->zero, 0b10101010 ); | |
return _mm_add_epi32( high, low ); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment