motorcityadam · July 22, 2023 03:41
diff --git a/u32_f32.c b/u32_f32.c
 // ---- Straightforward:
 // Convert every 32-bit lane of `in` to float by splitting it into two 16-bit
 // halves, converting each half exactly, and recombining with a single add.
 // NOTE(review): the lanes are presumably unsigned 32-bit values -- confirm at the caller.
 const __m128i low16_mask = _mm_set1_epi32(0xffff);
 const __m128  two_pow_16 = _mm_set1_ps(65536.0f);

 __m128i hi_int = _mm_srli_epi32(in, 16);        // high 16 bits of all vals
 __m128i lo_int = _mm_and_si128(in, low16_mask); // low 16 bits of all vals
 __m128  hi_flt = _mm_cvtepi32_ps(hi_int);       // exact
 __m128  lo_flt = _mm_cvtepi32_ps(lo_int);       // exact (all 16 bit ints = machine numbers)
 __m128  hi_scl = _mm_mul_ps(two_pow_16, hi_flt); // exact (just exponent change)
 __m128  result = _mm_add_ps(lo_flt, hi_scl);    // this is the only step that rounds.

 // same approach also works with FMA where available.

 // Alternative spin on same idea: do conversion using IEEE magic values.
 // Replaces hi_flt calculation with an "or", hi_scl calculation with a "sub".
 // Rest is the same. Not sure which is faster.

 // ---- Magic constant version 1:

 // 2**(16 + 23): at this magnitude the lowest mantissa bit of a float has weight
 // 2**16, so OR-ing a 16-bit integer into the low mantissa bits produces
 // magic_scaled16 + 65536 * value.
 static const float magic_scaled16 = 549755813888.0f;

 const __m128 scaled_bias = _mm_set1_ps(magic_scaled16);

 __m128i hi_int = _mm_srli_epi32(in, 16);
 __m128i lo_int = _mm_and_si128(in, _mm_set1_epi32(0xffff));
 // High half: build the biased float with an OR, then remove the bias again.
 __m128  hi_flt = _mm_or_ps(scaled_bias, _mm_castsi128_ps(hi_int));
 __m128  hi_scl = _mm_sub_ps(hi_flt, scaled_bias);
 // Low half still uses the regular int -> float conversion.
 __m128  lo_flt = _mm_cvtepi32_ps(lo_int);
 __m128  result = _mm_add_ps(lo_flt, hi_scl);

 // ---- And if you want to get really tricky, you can convert *both* ints to floats
 // via magic constants. This is cheaper than one would expect because the two bias
 // subtractions can be folded into one and still be exact for this problem:

 static const float magic_unscaled = 8388608.0f;      // 2**23
 static const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)

 // One combined bias is subtracted from the high part only; the 2**23 carried by
 // lo_flt cancels against it in the final add.
 const __m128 combined_bias = _mm_set1_ps(magic_scaled16 + magic_unscaled);

 __m128i hi_int = _mm_srli_epi32(in, 16);
 __m128i lo_int = _mm_and_si128(in, _mm_set1_epi32(0xffff));
 __m128  hi_flt = _mm_or_ps(_mm_set1_ps(magic_scaled16), _mm_castsi128_ps(hi_int));
 __m128  lo_flt = _mm_or_ps(_mm_set1_ps(magic_unscaled), _mm_castsi128_ps(lo_int));
 __m128  hi_scl = _mm_sub_ps(hi_flt, combined_bias);
 __m128  result = _mm_add_ps(lo_flt, hi_scl);

 // Needs 4 constants, though.

 // ---- One option that works everywhere and reduces the number of constant loads
 // to two is using shuffles:

 // Upper 16 bits of both magic floats packed into one 32-bit word:
 //   high half = top bits of 2**(16 + 23), low half = top bits of 2**23.
 // (Fixed: the initializer previously ended in an unbalanced ')'.)
 static const uint32_t pack_magic = ((127 + (16 + 23)) << 23) | ((127 + ( 0 + 23)) << 7);

 // NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
 // magic-constant variants; they must be in scope for the bias subtraction below.
 __m128i magic_words = _mm_set1_epi32(pack_magic);

 // Interleave the 16-bit words of `in` with the magic words. Each resulting 32-bit
 // lane is (input half | magic exponent << 16), alternating lo, hi, lo, hi.
 __m128  ileave_hi   = _mm_castsi128_ps(_mm_unpackhi_epi16(in, magic_words));
 __m128  ileave_lo   = _mm_castsi128_ps(_mm_unpacklo_epi16(in, magic_words));
 // Gather the odd lanes (high halves) and the even lanes (low halves).
 __m128  hi_flt      = _mm_shuffle_ps(ileave_lo, ileave_hi, 0xdd);
 __m128  lo_flt      = _mm_shuffle_ps(ileave_lo, ileave_hi, 0x88);
 __m128  hi_scl      = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
 __m128  result      = _mm_add_ps(hi_scl, lo_flt);

 // Makes it very shuffle-heavy though. Not a win in a tight loop that batch-converts,
 // but when you have a bunch of computation following it might be the right choice.

 // ---- With SSE 4.1, you can reduce the constants to three without any penalties,
 // by reducing the AND/OR for "lo_flt" to a single PBLENDW:

 // Requires SSE4.1 (PBLENDW). Mask 0xaa takes the upper 16-bit word of every lane
 // from the constant, i.e. it keeps the low 16 bits of `in` and stamps the exponent
 // bits of 2**23 on top in a single instruction.
 // NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
 // magic-constant variants; they must be in scope here.
 __m128  lo_flt      = _mm_castsi128_ps(_mm_blend_epi16(in, _mm_set1_epi32((127 + 23) << 23), 0xaa));
 __m128i hi_int      = _mm_srli_epi32(in, 16);
 // Fixed: _mm_or_ps takes two __m128 operands, so the scalar must be broadcast first.
 __m128  hi_flt      = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16));
 __m128  hi_scl      = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
 __m128  result      = _mm_add_ps(hi_scl, lo_flt);

 // ---- You can also reduce the constants to two by using the same trick again for
 // the high half, but this time it doesn't reduce the op count and is thus not as
 // attractive (it should also be slightly slower in isolation).

 // Upper 16 bits of both magic floats in one 32-bit word, laid out for blending:
 //   high half = top bits of 2**23, low half = top bits of 2**(16 + 23).
 // (Fixed: the initializer previously ended in an unbalanced ')'.)
 // NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
 // magic-constant variants; they must be in scope for the bias subtraction below.
 __m128i const blend_magic = _mm_set1_epi32(((127 + ( 0 + 23)) << 23) | ((127 + (16 + 23)) << 7));

 // Mask 0x55 replaces the low word of every lane with the scaled exponent bits,
 // leaving the high 16 input bits in the upper word.
 __m128i hi_tmp      = _mm_blend_epi16(in, blend_magic, 0x55);
 // Mask 0xaa replaces the high word of every lane with the unscaled exponent bits.
 __m128  lo_flt      = _mm_castsi128_ps(_mm_blend_epi16(in, blend_magic, 0xaa));
 // Rotate the whole register right by two bytes so each lane becomes
 // (high input half | scaled exponent << 16). PALIGNR needs SSSE3.
 __m128  hi_flt      = _mm_castsi128_ps(_mm_alignr_epi8(hi_tmp, hi_tmp, 2));
 __m128  hi_scl      = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
 __m128  result      = _mm_add_ps(hi_scl, lo_flt);
	// ---- Straightforward:
	// Convert every 32-bit lane of `in` to float by splitting it into two 16-bit
	// halves, converting each half exactly, and recombining with a single add.
	// NOTE(review): the lanes are presumably unsigned 32-bit values -- confirm at the caller.
	const __m128i low16_mask = _mm_set1_epi32(0xffff);
	const __m128 two_pow_16 = _mm_set1_ps(65536.0f);

	__m128i hi_int = _mm_srli_epi32(in, 16); // high 16 bits of all vals
	__m128i lo_int = _mm_and_si128(in, low16_mask); // low 16 bits of all vals
	__m128 hi_flt = _mm_cvtepi32_ps(hi_int); // exact
	__m128 lo_flt = _mm_cvtepi32_ps(lo_int); // exact (all 16 bit ints = machine numbers)
	__m128 hi_scl = _mm_mul_ps(two_pow_16, hi_flt); // exact (just exponent change)
	__m128 result = _mm_add_ps(lo_flt, hi_scl); // this is the only step that rounds.

	// same approach also works with FMA where available.

	// Alternative spin on same idea: do conversion using IEEE magic values.
	// Replaces hi_flt calculation with an "or", hi_scl calculation with a "sub".
	// Rest is the same. Not sure which is faster.

	// ---- Magic constant version 1:

	// 2**(16 + 23): at this magnitude the lowest mantissa bit of a float has weight
	// 2**16, so OR-ing a 16-bit integer into the low mantissa bits produces
	// magic_scaled16 + 65536 * value.
	static const float magic_scaled16 = 549755813888.0f;

	const __m128 scaled_bias = _mm_set1_ps(magic_scaled16);

	__m128i hi_int = _mm_srli_epi32(in, 16);
	__m128i lo_int = _mm_and_si128(in, _mm_set1_epi32(0xffff));
	// High half: build the biased float with an OR, then remove the bias again.
	__m128 hi_flt = _mm_or_ps(scaled_bias, _mm_castsi128_ps(hi_int));
	__m128 hi_scl = _mm_sub_ps(hi_flt, scaled_bias);
	// Low half still uses the regular int -> float conversion.
	__m128 lo_flt = _mm_cvtepi32_ps(lo_int);
	__m128 result = _mm_add_ps(lo_flt, hi_scl);

	// ---- And if you want to get really tricky, you can convert both ints to floats
	// via magic constants. This is cheaper than one would expect because the two bias
	// subtractions can be folded into one and still be exact for this problem:

	static const float magic_unscaled = 8388608.0f; // 2**23
	static const float magic_scaled16 = 549755813888.0f; // 2**(16 + 23)

	// One combined bias is subtracted from the high part only; the 2**23 carried by
	// lo_flt cancels against it in the final add.
	const __m128 combined_bias = _mm_set1_ps(magic_scaled16 + magic_unscaled);

	__m128i hi_int = _mm_srli_epi32(in, 16);
	__m128i lo_int = _mm_and_si128(in, _mm_set1_epi32(0xffff));
	__m128 hi_flt = _mm_or_ps(_mm_set1_ps(magic_scaled16), _mm_castsi128_ps(hi_int));
	__m128 lo_flt = _mm_or_ps(_mm_set1_ps(magic_unscaled), _mm_castsi128_ps(lo_int));
	__m128 hi_scl = _mm_sub_ps(hi_flt, combined_bias);
	__m128 result = _mm_add_ps(lo_flt, hi_scl);

	// Needs 4 constants, though.

	// ---- One option that works everywhere and reduces the number of constant loads
	// to two is using shuffles:

	// Upper 16 bits of both magic floats packed into one 32-bit word:
	//   high half = top bits of 2**(16 + 23), low half = top bits of 2**23.
	// (Fixed: removed a stray backslash before '|' and an unbalanced trailing ')'.)
	static const uint32_t pack_magic = ((127 + (16 + 23)) << 23) | ((127 + ( 0 + 23)) << 7);

	// NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
	// magic-constant variants; they must be in scope for the bias subtraction below.
	__m128i magic_words = _mm_set1_epi32(pack_magic);

	// Interleave the 16-bit words of `in` with the magic words. Each resulting 32-bit
	// lane is (input half | magic exponent << 16), alternating lo, hi, lo, hi.
	__m128 ileave_hi = _mm_castsi128_ps(_mm_unpackhi_epi16(in, magic_words));
	__m128 ileave_lo = _mm_castsi128_ps(_mm_unpacklo_epi16(in, magic_words));
	// Gather the odd lanes (high halves) and the even lanes (low halves).
	__m128 hi_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0xdd);
	__m128 lo_flt = _mm_shuffle_ps(ileave_lo, ileave_hi, 0x88);
	__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
	__m128 result = _mm_add_ps(hi_scl, lo_flt);

	// Makes it very shuffle-heavy though. Not a win in a tight loop that batch-converts,
	// but when you have a bunch of computation following it might be the right choice.

	// ---- With SSE 4.1, you can reduce the constants to three without any penalties,
	// by reducing the AND/OR for "lo_flt" to a single PBLENDW:

	// Requires SSE4.1 (PBLENDW). Mask 0xaa takes the upper 16-bit word of every lane
	// from the constant, i.e. it keeps the low 16 bits of `in` and stamps the exponent
	// bits of 2**23 on top in a single instruction.
	// NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
	// magic-constant variants; they must be in scope here.
	__m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, _mm_set1_epi32((127 + 23) << 23), 0xaa));
	__m128i hi_int = _mm_srli_epi32(in, 16);
	// Fixed: _mm_or_ps takes two __m128 operands, so the scalar must be broadcast first.
	__m128 hi_flt = _mm_or_ps(_mm_castsi128_ps(hi_int), _mm_set1_ps(magic_scaled16));
	__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
	__m128 result = _mm_add_ps(hi_scl, lo_flt);

	// ---- You can also reduce the constants to two by using the same trick again for
	// the high half, but this time it doesn't reduce the op count and is thus not as
	// attractive (it should also be slightly slower in isolation).

	// Upper 16 bits of both magic floats in one 32-bit word, laid out for blending:
	//   high half = top bits of 2**23, low half = top bits of 2**(16 + 23).
	// (Fixed: removed a stray backslash before '|' and an unbalanced trailing ')'.)
	// NOTE: magic_scaled16 and magic_unscaled are the float constants declared in the
	// magic-constant variants; they must be in scope for the bias subtraction below.
	__m128i const blend_magic = _mm_set1_epi32(((127 + ( 0 + 23)) << 23) | ((127 + (16 + 23)) << 7));

	// Mask 0x55 replaces the low word of every lane with the scaled exponent bits,
	// leaving the high 16 input bits in the upper word.
	__m128i hi_tmp = _mm_blend_epi16(in, blend_magic, 0x55);
	// Mask 0xaa replaces the high word of every lane with the unscaled exponent bits.
	__m128 lo_flt = _mm_castsi128_ps(_mm_blend_epi16(in, blend_magic, 0xaa));
	// Rotate the whole register right by two bytes so each lane becomes
	// (high input half | scaled exponent << 16). PALIGNR needs SSSE3.
	__m128 hi_flt = _mm_castsi128_ps(_mm_alignr_epi8(hi_tmp, hi_tmp, 2));
	__m128 hi_scl = _mm_sub_ps(hi_flt, _mm_set1_ps(magic_scaled16 + magic_unscaled));
	__m128 result = _mm_add_ps(hi_scl, lo_flt);