Created
March 21, 2012 18:27
-
-
Save castano/2150795 to your computer and use it in GitHub Desktop.
ISPC half to float conversion
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <intrin.h> // __rdtsc | |
#include <stdio.h> | |
// Short integer type names used throughout this benchmark.
typedef unsigned int       uint;
typedef unsigned short     uint16;
typedef unsigned long long uint64;

// Conversion kernels implemented in the ISPC source. Each one converts
// `count` half-precision values from `vin` into floats written to `vout`.
extern "C" {
    void half_to_float_test(const uint16 * vin, float * vout, int count);
    void half_to_float_test_fast(const uint16 * vin, float * vout, int count);
    void half_to_float_test_ryg(const uint16 * vin, float * vout, int count);
    void half_to_float_test_fast_ryg(const uint16 * vin, float * vout, int count);
}
union FP32 | |
{ | |
uint u; | |
float f; | |
struct | |
{ | |
uint Mantissa : 23; | |
uint Exponent : 8; | |
uint Sign : 1; | |
}; | |
}; | |
union FP16 | |
{ | |
unsigned short u; | |
struct | |
{ | |
uint Mantissa : 10; | |
uint Exponent : 5; | |
uint Sign : 1; | |
}; | |
}; | |
// Scalar reference conversion from half to float (adapted from the ISPC
// reference code). Handles signed zero, denormals, Inf/NaN and normalized
// inputs, and returns the result as an FP32 so callers can compare raw bits.
static FP32 half_to_float_full(FP16 h)
{
    const int exp_rebias = 127 - 15; // float exponent bias minus half exponent bias

    FP32 result = { 0 };
    result.Sign = h.Sign; // every case carries the sign across unchanged

    if (h.Exponent == 0)
    {
        // (Signed) zero: exponent and mantissa stay zero.
        if (h.Mantissa == 0)
            return result;

        // Denormal: shift the mantissa left until its leading one reaches
        // bit 10 (the implicit-one position), counting the extra shifts so
        // the exponent can be lowered by the same amount.
        uint frac = h.Mantissa;
        int extra_shifts = 0;
        frac <<= 1;
        while ((frac & 0x400) == 0)
        {
            frac <<= 1;
            extra_shifts++;
        }

        result.Mantissa = (frac & 0x3ff) << 13;
        result.Exponent = exp_rebias - extra_shifts;
        return result;
    }

    // Inf/NaN and normalized numbers both widen the mantissa by shifting it
    // into the top of the 23-bit field; only the exponent mapping differs.
    // NaN payload bits are carried over in the upper mantissa bits.
    result.Mantissa = h.Mantissa << 13;
    if (h.Exponent == 0x1f)
        result.Exponent = 255;                       // Inf/NaN
    else
        result.Exponent = exp_rebias + h.Exponent;   // normalized number

    return result;
}
// Converts four half-precision values to four floats using SSE2 only.
//
// `h` holds one half in the low 16 bits of each 32-bit lane; bits above the
// low 16 are masked off before use. Zero, denormals, normalized numbers and
// Inf/NaN are all handled, with the sign bit preserved.
//
// Constants are built with _mm_set1_epi32 instead of aligned static arrays
// read through pointer casts, which avoids the MSVC-only alignment attribute
// and the type-punned loads.
static __m128 half_to_float4_SSE2(__m128i h)
{
    const __m128i mask_nosign      = _mm_set1_epi32(0x7fff);
    const __m128i mask_justsign    = _mm_set1_epi32(0x8000);
    const __m128i mask_shifted_exp = _mm_set1_epi32(0x7c00 << 13);
    const __m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
    const __m128i expadjust_infnan = _mm_set1_epi32((128 - 16) << 23);
    const __m128i expadjust_denorm = _mm_set1_epi32(1 << 23);
    const __m128  magic_denorm     = _mm_castsi128_ps(_mm_set1_epi32(113 << 23));

    // Split sign from exponent+mantissa and move the latter into float position.
    __m128i expmant   = _mm_and_si128(mask_nosign, h);
    __m128i justsign  = _mm_and_si128(h, mask_justsign);
    __m128i shifted   = _mm_slli_epi32(expmant, 13);

    // Rebias the exponent as if every lane were a normalized number.
    __m128i adjusted  = _mm_add_epi32(expadjust_normal, shifted);

    // Classify lanes by their (shifted) half exponent field.
    __m128i justexp    = _mm_and_si128(shifted, mask_shifted_exp);
    __m128i zero       = _mm_setzero_si128();
    __m128i b_isinfnan = _mm_cmpeq_epi32(mask_shifted_exp, justexp);
    __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp);

    // Inf/NaN lanes get a second exponent adjustment so the exponent becomes 255.
    __m128i adj_infnan = _mm_and_si128(b_isinfnan, expadjust_infnan);
    __m128i adjusted2  = _mm_add_epi32(adjusted, adj_infnan);

    // Zero/denormal lanes: bump the exponent by one and subtract the matching
    // power of two as a float, letting the FP subtract do the renormalization.
    __m128i den1 = _mm_add_epi32(expadjust_denorm, adjusted2);
    __m128  den2 = _mm_sub_ps(_mm_castsi128_ps(den1), magic_denorm);

    // Select the denormal result where applicable, the integer-adjusted one elsewhere.
    __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
    __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
    __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);

    // Move the sign bit from bit 15 to bit 31 and merge it in.
    __m128i sign  = _mm_slli_epi32(justsign, 16);
    __m128 result = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
    // ~21 SSE2 ops.
    return result;
}
void half_to_float_test_sse2(const uint16 * vin, float * vout, int count) { | |
__m128i zero = _mm_setzero_si128(); | |
for (int i = 0; i < count; i += 8) | |
{ | |
__m128i in = _mm_loadu_si128((const __m128i *)(vin + i)); | |
__m128i a = _mm_unpacklo_epi16(in, zero); | |
__m128i b = _mm_unpackhi_epi16(in, zero); | |
__m128 outa = half_to_float4_SSE2(a); | |
_mm_storeu_ps((float *)(vout + i), outa); | |
__m128 outb = half_to_float4_SSE2(b); | |
_mm_storeu_ps((float *)(vout + i + 4), outb); | |
} | |
} | |
int test_results(const uint16 * vin, const float * vout, int count) { | |
int error_count = 0; | |
for (int i = 0; i < count; i++) | |
{ | |
FP16 in; | |
in.u = vin[i]; | |
FP32 full = half_to_float_full(in); | |
FP32 out; | |
out.f = vout[i]; | |
if (full.u != out.u) error_count++; | |
} | |
return error_count; | |
} | |
int main(void) { | |
uint16 vin[0x10000]; | |
float vout[0x10000]; | |
// Init input. | |
for (int i = 0; i < 0x10000; i++) | |
{ | |
vin[i] = i; | |
} | |
// Run tests. | |
uint64 start, end; | |
half_to_float_test(vin, vout, 0x10000); | |
int error_count = test_results(vin, vout, 0x10000); | |
start = __rdtsc(); | |
for (int i = 0; i < 64; i++) half_to_float_test(vin, vout, 0x10000); | |
end = __rdtsc(); | |
printf("half_to_float: %.3f (%d)\n", double(end-start) / (1000*1000*4), error_count); | |
half_to_float_test_fast(vin, vout, 0x10000); | |
error_count = test_results(vin, vout, 0x10000); | |
start = __rdtsc(); | |
for (int i = 0; i < 64; i++) half_to_float_test_fast(vin, vout, 0x10000); | |
end = __rdtsc(); | |
printf("half_to_float_fast: %.3f (%d)\n", double(end-start) / (1000*1000*4), error_count); | |
half_to_float_test_ryg(vin, vout, 0x10000); | |
error_count = test_results(vin, vout, 0x10000); | |
start = __rdtsc(); | |
for (int i = 0; i < 64; i++) half_to_float_test_ryg(vin, vout, 0x10000); | |
end = __rdtsc(); | |
printf("half_to_float_ryg: %.3f (%d)\n", double(end-start) / (1000*1000*4), error_count); | |
half_to_float_test_fast_ryg(vin, vout, 0x10000); | |
error_count = test_results(vin, vout, 0x10000); | |
start = __rdtsc(); | |
for (int i = 0; i < 64; i++) half_to_float_test_fast_ryg(vin, vout, 0x10000); | |
end = __rdtsc(); | |
printf("half_to_float_fast_ryg: %.3f (%d)\n", double(end-start) / (1000*1000*4), error_count); | |
half_to_float_test_sse2(vin, vout, 0x10000); | |
error_count = test_results(vin, vout, 0x10000); | |
start = __rdtsc(); | |
for (int i = 0; i < 64; i++) half_to_float_test_sse2(vin, vout, 0x10000); | |
end = __rdtsc(); | |
printf("half_to_float_sse2: %.3f (%d)\n", double(end-start) / (1000*1000*4), error_count); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Half -> float conversion that handles zero, denormals, normalized values
// and Inf/NaN. Uses the native conversion when the target provides one.
inline float half_to_float_ryg(unsigned int16 h)
{
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }

    // Separate the sign from the exponent+mantissa bits and move each into
    // its position in the 32-bit float layout.
    unsigned int32 sign16    = h & (int32)0x8000u;
    unsigned int32 expmant16 = h & (int32)0x7fffu;
    unsigned int32 sign32    = sign16 << 16;
    unsigned int32 expmant32 = expmant16 << 13;
    unsigned int32 exp32     = expmant32 & 0xF800000; // & (0x7c00 << 13)

    // Rebias the exponent as for a normalized number: (127 - 15) << 23
    expmant32 += 0x38000000;

    // Exponent special cases
    if (exp32 == 0xF800000) {
        // Inf/NaN: apply the adjustment a second time
        expmant32 += 0x38000000;
    }
    else if (exp32 == 0) {
        // Zero/denormal: bump the exponent, then renormalize with a float subtract
        expmant32 += 0x800000;
        expmant32 = intbits(floatbits(expmant32) - floatbits(0x38800000));
    }

    return floatbits(sign32 | expmant32);
}
// Fast half -> float conversion: repositions the bits and rebiases the
// exponent with no special handling for zero, denormals or Inf/NaN.
// Uses the native conversion when the target provides one.
inline float half_to_float_fast_ryg(unsigned int16 h)
{
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }

    unsigned int32 sign16    = h & (int32)0x8000u; // sign bit
    unsigned int32 expmant16 = h & (int32)0x7fffu; // exponent-mantissa bits
    unsigned int32 sign32    = sign16 << 16;
    unsigned int32 expmant32 = expmant16 << 13;

    expmant32 += 0x38000000; // (127 - 15) << 23

    return floatbits(sign32 | expmant32);
}
// Converts `count` halfs from `vin` into `vout` using half_to_float().
export void half_to_float_test(uniform const unsigned int16 vin[], uniform float vout[], uniform int count)
{
    foreach (idx = 0 ... count) {
        vout[idx] = half_to_float(vin[idx]);
    }
}
// Converts `count` halfs from `vin` into `vout` using half_to_float_fast().
export void half_to_float_test_fast(uniform const unsigned int16 vin[], uniform float vout[], uniform int count)
{
    foreach (idx = 0 ... count) {
        vout[idx] = half_to_float_fast(vin[idx]);
    }
}
// Converts `count` halfs from `vin` into `vout` using half_to_float_ryg().
export void half_to_float_test_ryg(uniform const unsigned int16 vin[], uniform float vout[], uniform int count)
{
    foreach (idx = 0 ... count) {
        vout[idx] = half_to_float_ryg(vin[idx]);
    }
}
// Converts `count` halfs from `vin` into `vout` using half_to_float_fast_ryg().
export void half_to_float_test_fast_ryg(uniform const unsigned int16 vin[], uniform float vout[], uniform int count)
{
    foreach (idx = 0 ... count) {
        vout[idx] = half_to_float_fast_ryg(vin[idx]);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Reference:
  half_to_float_sse2: 2.306
ISPC targeting SSE2:
  half_to_float: 6.668
  half_to_float_fast: 2.554
  half_to_float_ryg: 4.390
  half_to_float_fast_ryg: 2.005
ISPC targeting SSE4:
  half_to_float: 4.776
  half_to_float_fast: 1.939
  half_to_float_ryg: 3.121
  half_to_float_fast_ryg: 1.448
ISPC targeting AVX:
  half_to_float: 4.368
  half_to_float_fast: 1.857
  half_to_float_ryg: 2.791
  half_to_float_fast_ryg: 1.275
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment