-
-
Save motorcityadam/abb3fec692a7fa0a8cd26f56804c0f11 to your computer and use it in GitHub Desktop.
U32->F32 using SSE2 intrinsics.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---- Straightforward:
// Split every 32-bit lane of `in` into its two 16-bit halves, convert each half
// separately, then recombine as hi * 65536 + lo.
const __m128i low16_mask = _mm_set1_epi32(0xffff);
const __m128 scale_2p16 = _mm_set1_ps(65536.0f);
__m128i hi_int = _mm_srli_epi32(in, 16);            // high 16 bits of all vals
__m128i lo_int = _mm_and_si128(in, low16_mask);     // low 16 bits of all vals
__m128 hi_flt = _mm_cvtepi32_ps(hi_int);            // exact (all 16 bit ints = machine numbers)
__m128 lo_flt = _mm_cvtepi32_ps(lo_int);            // exact
__m128 hi_scl = _mm_mul_ps(hi_flt, scale_2p16);     // exact (just exponent change)
__m128 result = _mm_add_ps(hi_scl, lo_flt);         // this is the only step that rounds.
// The same approach also works with FMA where available.
// Alternative spin on same idea: do conversion using IEEE magic values. | |
// Replaces hi_flt calculation with an "or", hi_scl calculation with a "sub". | |
// Rest is the same. Not sure which is faster. | |
// ---- Magic constant version 1: | |
static const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23) | |
__m128i lo_int = _mm_and_si128(_mm_set1_epi32(0xffff), in); | |
__m128i hi_int = _mm_srli_epi32(in, 16); | |
__m128 lo_flt = _mm_cvtepi32_ps(lo_int); | |
__m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16)); | |
__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16)); | |
__m128 result = _mm_add_ps(hi_scl, lo_flt); | |
// ---- And if you want to get really tricky, you can convert *both* ints to floats | |
// via magic constants. This is cheaper than one would expect because the two bias | |
// subtractions can be folded into one and still be exact for this problem: | |
static const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23) | |
static const float magic_unscaled = 8388608.0f; // 2**23 | |
__m128i lo_int = _mm_and_si128(_mm_set1_epi32(0xffff), in); | |
__m128i hi_int = _mm_srli_epi32(in, 16); | |
__m128 lo_flt = _mm_or_ps(_mm_castsi128_ps(lo_int), _mm_set1_ps(magic_unscaled)); | |
__m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16)); | |
__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); | |
__m128 result = _mm_add_ps(hi_scl, lo_flt); | |
// Needs 4 constants, though. | |
// ---- One option that works everywhere and reduces the number of constant loads | |
// to two is using shuffles: | |
static const uint32_t pack_magic = ((127 + (16 + 23)) << 23) | ((127 + ( 0 + 23)) << 7)); | |
__m128 ileave_hi = _mm_castsi128_ps(_mm_unpackhi_epi16(in, _mm_set1_epi32(pack_magic))); | |
__m128 ileave_lo = _mm_castsi128_ps(_mm_unpacklo_epi16(in, _mm_set1_epi32(pack_magic))); | |
__m128 hi_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0xdd); | |
__m128 lo_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0x88); | |
__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); | |
__m128 result = _mm_add_ps(hi_scl, lo_flt); | |
// Makes it very shuffle-heavy though. Not a win in a tight loop that batch-converts, | |
// but when you have a bunch of computation following it might be the right choice. | |
// ---- With SSE 4.1, you can reduce the constants to three without any penalties, | |
// by reducing the AND/OR for "lo_flt" to a single PBLENDW: | |
__m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, _mm_set1_epi32((127 + 23) << 23), 0xaa)); | |
__m128i hi_int = _mm_srli_epi32(in, 16); | |
__m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), magic_scaled16); | |
__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); | |
__m128 result = _mm_add_ps(hi_scl, lo_flt); | |
// ---- You can also reduce the constants to two by using the same trick again for | |
// the high half, but this time it doesn't reduce the op count and is thus not as | |
// attractive (it should also be slightly slower in isolation). | |
__m128i const blend_magic = _mm_set1_epi32(((127 + ( 0 + 23)) << 23) | ((127 + (16 + 23)) << 7))); | |
__m128i hi_tmp = _mm_blend_epi16(in, blend_magic, 0x55); | |
__m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, blend_magic, 0xaa)); | |
__m128 hi_flt = _mm_castsi128_ps(_mm_alignr_epi8(hi_tmp, hi_tmp, 2)); | |
__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled)); | |
__m128 result = _mm_add_ps(hi_scl, lo_flt); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment