Created
October 18, 2011 23:51
-
-
Save kayru/1297108 to your computer and use it in GitHub Desktop.
SSE2 Half to Float
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Yuriy O'Donnell <[email protected]> | |
// Released under MIT License (do whatever you want with it) | |
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include <xnamath.h>
#include <malloc.h>
#include <float.h>
#include <algorithm>
#include <xmmintrin.h>
#include <emmintrin.h>
#define ALIGN16 __declspec(align(16)) | |
typedef unsigned short uint16; | |
typedef unsigned int uint32; | |
// XMConvertFloatToHalfStream -- 6.72 cycles per value | |
// D3DXFloat16To32Array -- 2.22 cycles per value | |
// both of the above functions handle 0 correctly | |
// ~4.63 cycles per value | |
// does not handle 0 correctly | |
// Converts one half-precision (binary16) value to a 32-bit float.
// The exponent rebias (+0x0001C000 == (127 - 15) << 10, applied before the
// <<13 shift) assumes a normalized input, which is why 0, denormals, Inf
// and NaN are not handled correctly.
// 'static' added: a plain C99/C11 'inline' definition does not provide an
// external definition and can fail to link outside MSVC.
static inline float half_to_float(uint16 v)
{
    uint32 s, e, m, r;
    s = v & 0x8000;      // sign bit (bit 15)
    m = v & 0x03FF;      // 10-bit mantissa, still at bit 0
    e = v & 0x7C00;      // 5-bit exponent, still shifted left by 10
    e += 0x0001C000;     // rebias exponent: (127 - 15) << 10
    r = (s << 16) | (m << 13) | (e << 13);
    // Type-pun via memcpy instead of *(float*)&r: the pointer cast violates
    // strict aliasing (undefined behavior). Optimizing compilers emit the
    // same single register move for the memcpy.
    float f;
    memcpy(&f, &r, sizeof(f));
    return f;
}
// ~1.68 cycles per value when __fastcall | |
// ~1.38 cycles per value when __forceinline | |
// does not handle 0 correctly | |
// Converts 8 halfs to 8 floats per call using SSE2 intrinsics.
// Same algorithm as half_to_float(): mask out sign/mantissa/exponent,
// rebias the exponent, shift everything into float bit positions.
// Does not handle zero, denormals, Inf or NaN correctly.
// Both pointers must be 16-byte aligned (aligned load/store are used).
// __forceinline is MSVC-only; fall back to plain static inline elsewhere
// so the function is portable to GCC/Clang.
#if defined(_MSC_VER)
#define HTF_FORCEINLINE __forceinline
#else
#define HTF_FORCEINLINE static inline
#endif
HTF_FORCEINLINE void half_to_float_sse2_intrin_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
    const __m128i mask_s8 = _mm_set1_epi16((short)0x8000); // sign bits
    const __m128i mask_m8 = _mm_set1_epi16((short)0x03FF); // mantissa bits
    const __m128i mask_e8 = _mm_set1_epi16((short)0x7C00); // exponent bits
    const __m128i bias_e4 = _mm_set1_epi32(0x0001C000);    // (127-15) << 10
    // exactly the same process as half_to_float()
    // partially 8 at a time, partially 4
    __m128i h8 = _mm_load_si128((__m128i*)halfs);
    // get sign, mantissa and exponent bits for all 8 halfs
    __m128i s8 = _mm_and_si128(h8, mask_s8);
    __m128i m8 = _mm_and_si128(h8, mask_m8);
    __m128i e8 = _mm_and_si128(h8, mask_e8);
    // first 4: widen each 16-bit field to 32 bits (unpack against zero),
    // then shift into the float32 bit positions
    __m128i s4a = _mm_unpacklo_epi16(s8, _mm_setzero_si128());
    s4a = _mm_slli_epi32(s4a, 16); // sign into bit 31
    __m128i m4a = _mm_unpacklo_epi16(m8, _mm_setzero_si128());
    m4a = _mm_slli_epi32(m4a, 13); // mantissa into bits 13..22
    __m128i e4a = _mm_unpacklo_epi16(e8, _mm_setzero_si128());
    e4a = _mm_add_epi32(bias_e4, e4a); // rebias before shifting
    e4a = _mm_slli_epi32(e4a, 13);     // exponent into bits 23..30
    __m128i f4a = _mm_or_si128(s4a, _mm_or_si128(e4a, m4a));
    _mm_store_si128((__m128i*)floats, f4a);
    // second 4: same, using the high halves
    __m128i s4b = _mm_unpackhi_epi16(s8, _mm_setzero_si128());
    s4b = _mm_slli_epi32(s4b, 16);
    __m128i m4b = _mm_unpackhi_epi16(m8, _mm_setzero_si128());
    m4b = _mm_slli_epi32(m4b, 13);
    __m128i e4b = _mm_unpackhi_epi16(e8, _mm_setzero_si128());
    e4b = _mm_add_epi32(bias_e4, e4b);
    e4b = _mm_slli_epi32(e4b, 13);
    __m128i f4b = _mm_or_si128(s4b, _mm_or_si128(e4b, m4b));
    _mm_store_si128((__m128i*)(floats+4), f4b);
}
// ~1.89 cycles per value when __fastcall | |
// does not handle 0 correctly | |
// Converts 8 halfs to 8 floats per call; same algorithm as the C version
// above (mask, rebias exponent, shift into float32 bit positions).
// MSVC x86-32 only: uses the MSVC __asm inline assembler, which is not
// available for x64 targets. With __fastcall the two pointer arguments
// arrive in ecx (halfs) and edx (floats), which the asm reads directly.
// Both pointers must be 16-byte aligned: movdqa faults on unaligned
// addresses.
void __fastcall half_to_float_sse2_asm_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
#define SPLAT_4(x) {x,x,x,x}
#define SPLAT_8(x) {x,x,x,x,x,x,x,x}
// constants live in static storage so the asm can reference them by name
static __declspec(align(16)) uint16 mask_s8[8] = SPLAT_8(0x8000);
static __declspec(align(16)) uint16 mask_m8[8] = SPLAT_8(0x03FF);
static __declspec(align(16)) uint16 mask_e8[8] = SPLAT_8(0x7C00);
static __declspec(align(16)) uint32 bias_e4[4] = SPLAT_4(0x0001C000);
#undef SPLAT_4
#undef SPLAT_8
__asm
{
// load halfs into sse register (ecx == halfs, per __fastcall)
movdqa xmm0, [ecx]
pxor xmm7, xmm7 // xmm7 = 0, used to widen 16-bit words to 32-bit dwords
// get sign, mantissa and exponent bits for all 8 halfs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm0, bias_e4 // xmm0 repurposed to hold the exponent rebias constant
pand xmm1, mask_s8
pand xmm2, mask_m8
pand xmm3, mask_e8
// first 4 sign: widen, then move sign into bit 31
movdqa xmm4, xmm1
punpcklwd xmm4, xmm7
pslld xmm4, 16
// first 4 mantissa: widen, shift into bits 13..22
movdqa xmm5, xmm2
punpcklwd xmm5, xmm7
pslld xmm5, 13
// first 4 exponent: widen, rebias, shift into bits 23..30
movdqa xmm6, xmm3
punpcklwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// first 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx], xmm6 // edx == floats, per __fastcall
// second 4 sign (high halves of the same registers)
movdqa xmm4, xmm1
punpckhwd xmm4, xmm7
pslld xmm4, 16
// second 4 mantissa
movdqa xmm5, xmm2
punpckhwd xmm5, xmm7
pslld xmm5, 13
// second 4 exponent
movdqa xmm6, xmm3
punpckhwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// second 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx+16], xmm6
}
}
int main() | |
{ | |
size_t count = 32768; | |
// prepare data | |
float* input = (float*) _aligned_malloc(count * sizeof(float), 16); | |
uint16* packed = (uint16*) _aligned_malloc(count * sizeof(uint16), 16); | |
float* unpacked_a = (float*) _aligned_malloc(count * sizeof(float), 16); | |
float* unpacked_b = (float*) _aligned_malloc(count * sizeof(float), 16); | |
for( size_t i=0; i<count; ++i ) | |
{ | |
input[i] = float(i) / 100.0f; // simple numbers to deal with | |
} | |
XMConvertFloatToHalfStream(packed, sizeof(packed[0]), input, sizeof(input[0]), count); | |
float dt0 = FLT_MAX; | |
float dt1 = FLT_MAX; | |
for( size_t attempt=0; attempt<1000; ++attempt ) | |
{ | |
DWORD64 t0 = __rdtsc(); | |
for( size_t i=0; i<count; i+=8 ) | |
{ | |
half_to_float_sse2_asm_x8(packed+i, unpacked_a+i); | |
} | |
DWORD64 t1 = __rdtsc(); | |
for( size_t i=0; i<count; i+=8 ) | |
{ | |
half_to_float_sse2_intrin_x8(packed+i, unpacked_b+i); | |
} | |
DWORD64 t2 = __rdtsc(); | |
dt0 = std::min(dt0, float(t1-t0)/count); | |
dt1 = std::min(dt1, float(t2-t1)/count); | |
} | |
printf("dt0: %f\n", dt0); | |
printf("dt1: %f\n", dt1); | |
_aligned_free(input); | |
_aligned_free(packed); | |
_aligned_free(unpacked_a); | |
_aligned_free(unpacked_b); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment