Created
October 18, 2011 23:51
-
-
Save kayru/1297108 to your computer and use it in GitHub Desktop.
SSE2 Half to Float
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Yuriy O'Donnell <[email protected]> | |
// Released under MIT License (do whatever you want with it) | |
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <stdio.h>
#include <string.h>
#include <windows.h>
#include <xnamath.h>
#include <malloc.h>
#include <float.h>
#include <algorithm>
#include <xmmintrin.h>
#include <emmintrin.h>
#define ALIGN16 __declspec(align(16)) | |
typedef unsigned short uint16; | |
typedef unsigned int uint32; | |
// XMConvertFloatToHalfStream -- 6.72 cycles per value | |
// D3DXFloat16To32Array -- 2.22 cycles per value | |
// both of the above functions handle 0 correctly | |
// ~4.63 cycles per value | |
// does not handle 0 correctly | |
// Converts one half-precision (binary16) value to a 32-bit float.
// The exponent rebias (+0x0001C000 == (127 - 15) << 10, applied before the
// <<13 shift) assumes a normalized input, which is why 0, denormals, Inf
// and NaN are not handled correctly.
// 'static' added: a plain C99/C11 'inline' definition does not provide an
// external definition and can fail to link outside MSVC.
static inline float half_to_float(uint16 v)
{
    uint32 s, e, m, r;
    s = v & 0x8000;      // sign bit (bit 15)
    m = v & 0x03FF;      // 10-bit mantissa, still at bit 0
    e = v & 0x7C00;      // 5-bit exponent, still shifted left by 10
    e += 0x0001C000;     // rebias exponent: (127 - 15) << 10
    r = (s << 16) | (m << 13) | (e << 13);
    // Type-pun via memcpy instead of *(float*)&r: the pointer cast violates
    // strict aliasing (undefined behavior). Optimizing compilers emit the
    // same single register move for the memcpy.
    float f;
    memcpy(&f, &r, sizeof(f));
    return f;
}
// ~1.68 cycles per value when __fastcall | |
// ~1.38 cycles per value when __forceinline | |
// does not handle 0 correctly | |
// Converts 8 halfs to 8 floats per call using SSE2 intrinsics.
// Same algorithm as half_to_float(): mask out sign/mantissa/exponent,
// rebias the exponent, shift everything into float bit positions.
// Does not handle zero, denormals, Inf or NaN correctly.
// Both pointers must be 16-byte aligned (aligned load/store are used).
// __forceinline is MSVC-only; fall back to plain static inline elsewhere
// so the function is portable to GCC/Clang.
#if defined(_MSC_VER)
#define HTF_FORCEINLINE __forceinline
#else
#define HTF_FORCEINLINE static inline
#endif
HTF_FORCEINLINE void half_to_float_sse2_intrin_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
    const __m128i mask_s8 = _mm_set1_epi16((short)0x8000); // sign bits
    const __m128i mask_m8 = _mm_set1_epi16((short)0x03FF); // mantissa bits
    const __m128i mask_e8 = _mm_set1_epi16((short)0x7C00); // exponent bits
    const __m128i bias_e4 = _mm_set1_epi32(0x0001C000);    // (127-15) << 10
    // exactly the same process as half_to_float()
    // partially 8 at a time, partially 4
    __m128i h8 = _mm_load_si128((__m128i*)halfs);
    // get sign, mantissa and exponent bits for all 8 halfs
    __m128i s8 = _mm_and_si128(h8, mask_s8);
    __m128i m8 = _mm_and_si128(h8, mask_m8);
    __m128i e8 = _mm_and_si128(h8, mask_e8);
    // first 4: widen each 16-bit field to 32 bits (unpack against zero),
    // then shift into the float32 bit positions
    __m128i s4a = _mm_unpacklo_epi16(s8, _mm_setzero_si128());
    s4a = _mm_slli_epi32(s4a, 16); // sign into bit 31
    __m128i m4a = _mm_unpacklo_epi16(m8, _mm_setzero_si128());
    m4a = _mm_slli_epi32(m4a, 13); // mantissa into bits 13..22
    __m128i e4a = _mm_unpacklo_epi16(e8, _mm_setzero_si128());
    e4a = _mm_add_epi32(bias_e4, e4a); // rebias before shifting
    e4a = _mm_slli_epi32(e4a, 13);     // exponent into bits 23..30
    __m128i f4a = _mm_or_si128(s4a, _mm_or_si128(e4a, m4a));
    _mm_store_si128((__m128i*)floats, f4a);
    // second 4: same, using the high halves
    __m128i s4b = _mm_unpackhi_epi16(s8, _mm_setzero_si128());
    s4b = _mm_slli_epi32(s4b, 16);
    __m128i m4b = _mm_unpackhi_epi16(m8, _mm_setzero_si128());
    m4b = _mm_slli_epi32(m4b, 13);
    __m128i e4b = _mm_unpackhi_epi16(e8, _mm_setzero_si128());
    e4b = _mm_add_epi32(bias_e4, e4b);
    e4b = _mm_slli_epi32(e4b, 13);
    __m128i f4b = _mm_or_si128(s4b, _mm_or_si128(e4b, m4b));
    _mm_store_si128((__m128i*)(floats+4), f4b);
}
// ~1.89 cycles per value when __fastcall | |
// does not handle 0 correctly | |
// Converts 8 halfs to 8 floats per call; same algorithm as the C version
// above (mask, rebias exponent, shift into float32 bit positions).
// MSVC x86-32 only: uses the MSVC __asm inline assembler, which is not
// available for x64 targets. With __fastcall the two pointer arguments
// arrive in ecx (halfs) and edx (floats), which the asm reads directly.
// Both pointers must be 16-byte aligned: movdqa faults on unaligned
// addresses.
void __fastcall half_to_float_sse2_asm_x8(const uint16* halfs, float* floats) // works on 8 at a time
{
#define SPLAT_4(x) {x,x,x,x}
#define SPLAT_8(x) {x,x,x,x,x,x,x,x}
// constants live in static storage so the asm can reference them by name
static __declspec(align(16)) uint16 mask_s8[8] = SPLAT_8(0x8000);
static __declspec(align(16)) uint16 mask_m8[8] = SPLAT_8(0x03FF);
static __declspec(align(16)) uint16 mask_e8[8] = SPLAT_8(0x7C00);
static __declspec(align(16)) uint32 bias_e4[4] = SPLAT_4(0x0001C000);
#undef SPLAT_4
#undef SPLAT_8
__asm
{
// load halfs into sse register (ecx == halfs, per __fastcall)
movdqa xmm0, [ecx]
pxor xmm7, xmm7 // xmm7 = 0, used to widen 16-bit words to 32-bit dwords
// get sign, mantissa and exponent bits for all 8 halfs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
movdqa xmm0, bias_e4 // xmm0 repurposed to hold the exponent rebias constant
pand xmm1, mask_s8
pand xmm2, mask_m8
pand xmm3, mask_e8
// first 4 sign: widen, then move sign into bit 31
movdqa xmm4, xmm1
punpcklwd xmm4, xmm7
pslld xmm4, 16
// first 4 mantissa: widen, shift into bits 13..22
movdqa xmm5, xmm2
punpcklwd xmm5, xmm7
pslld xmm5, 13
// first 4 exponent: widen, rebias, shift into bits 23..30
movdqa xmm6, xmm3
punpcklwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// first 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx], xmm6 // edx == floats, per __fastcall
// second 4 sign (high halves of the same registers)
movdqa xmm4, xmm1
punpckhwd xmm4, xmm7
pslld xmm4, 16
// second 4 mantissa
movdqa xmm5, xmm2
punpckhwd xmm5, xmm7
pslld xmm5, 13
// second 4 exponent
movdqa xmm6, xmm3
punpckhwd xmm6, xmm7
paddd xmm6, xmm0
pslld xmm6, 13
// second 4 pack
por xmm6, xmm5
por xmm6, xmm4
movdqa [edx+16], xmm6
}
}
int main() | |
{ | |
size_t count = 32768; | |
// prepare data | |
float* input = (float*) _aligned_malloc(count * sizeof(float), 16); | |
uint16* packed = (uint16*) _aligned_malloc(count * sizeof(uint16), 16); | |
float* unpacked_a = (float*) _aligned_malloc(count * sizeof(float), 16); | |
float* unpacked_b = (float*) _aligned_malloc(count * sizeof(float), 16); | |
for( size_t i=0; i<count; ++i ) | |
{ | |
input[i] = float(i) / 100.0f; // simple numbers to deal with | |
} | |
XMConvertFloatToHalfStream(packed, sizeof(packed[0]), input, sizeof(input[0]), count); | |
float dt0 = FLT_MAX; | |
float dt1 = FLT_MAX; | |
for( size_t attempt=0; attempt<1000; ++attempt ) | |
{ | |
DWORD64 t0 = __rdtsc(); | |
for( size_t i=0; i<count; i+=8 ) | |
{ | |
half_to_float_sse2_asm_x8(packed+i, unpacked_a+i); | |
} | |
DWORD64 t1 = __rdtsc(); | |
for( size_t i=0; i<count; i+=8 ) | |
{ | |
half_to_float_sse2_intrin_x8(packed+i, unpacked_b+i); | |
} | |
DWORD64 t2 = __rdtsc(); | |
dt0 = std::min(dt0, float(t1-t0)/count); | |
dt1 = std::min(dt1, float(t2-t1)/count); | |
} | |
printf("dt0: %f\n", dt0); | |
printf("dt1: %f\n", dt1); | |
_aligned_free(input); | |
_aligned_free(packed); | |
_aligned_free(unpacked_a); | |
_aligned_free(unpacked_b); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment