alnsn · January 4, 2018 00:44
diff --git a/u4.c b/u4.c
 #include <assert.h>
 #include <err.h>
 #include <inttypes.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include <emmintrin.h>
 #include <tmmintrin.h>

 #include "cycle.h"

 static inline __m128i d4toa(unsigned int u)
 {
 	/*
 	 * Multiply u by -65536 (to move -u to the high word),
 	 * 26215 (magic for u/10), 10486 (magic for u/100) and
 	 * 8389 (magic for u/1000).
 	 */
 	const __m128i first_madd =
 	    _mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

 	/*
 	 * Zero-out 18 low bits of u*26215, 20 low bits of u*10486
 	 * and 23 low bits of u*8389:
 	 * [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
 	 */
 	const __m128i mask =
 	    _mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

 	/*
 	 * Input value
 	 *
 	 * [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
 	 *
 	 * is multiplied to produce 4 scaled digits:
 	 *
 	 * [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
 	 *  (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
 	 */
 	const __m128i second_madd =
 	    _mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

 	/*
 	 * Shuffle digits to low bytes and OR with ascii zeroes.
 	 * Only low 32-bit word matter, three other words can
 	 * have any values.
 	 */
 	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
 	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

 	__m128i x;

 	//assert(u <= 9999);

 	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
 	x = _mm_and_si128(x, mask);
 	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
 	x = _mm_madd_epi16(x, second_madd);
 	x = _mm_shuffle_epi8(x, shuffle);
 	x = _mm_or_si128(x, ascii_zero);

 	return x;
 }

 #define INLINE static inline __attribute__((always_inline))

 typedef __uint128_t uint128_t;

 /* Powers of 10. */
 #define PO2                100ULL
 #define PO4              10000ULL
 #define PO8          100000000ULL
 #define PO10       10000000000ULL
 #define PO16 10000000000000000ULL

 /* 64 bit's worth of '0' characters (i.e., 8 characters). */
 #define ZERO_CHARS 0x3030303030303030ULL

 INLINE uint32_t
 encode_hundreds(uint32_t hi, uint32_t lo)
 {
 	/*
 	 * Pack everything in a single 32 bit value.
 	 *
 	 * merged = [ hi 0 lo 0 ]
 	 */
        uint32_t merged = hi | (lo << 16);
 	/*
 	 * Fixed-point multiplication by 103/1024 ~= 1/10.
 	 */
        uint32_t tens = (merged * 103UL) >> 10;

 	/*
 	 * Mask away garbage bits between our digits.
 	 *
 	 * tens = [ hi/10 0 lo/10 0 ]
 	 *
 	 * On a platform with more restricted literals (ARM, for
 	 * instance), it may make sense to and-not with the middle
 	 * bits.
 	 */
        tens &= (0xFUL << 16) | 0xFUL;

 	/*
 	 * x mod 10 = x - 10 * (x div 10).
 	 *
 	 * (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
 	 *
 	 * Then insert these values between tens.  Arithmetic instead
 	 * of bitwise operation helps the compiler merge this with
 	 * later increments by a constant (e.g., ZERO_CHARS).
 	 */
 	return tens + ((merged - 10UL * tens) << 8);
 }

 INLINE void *
 itoa_ten_thousand(void *out, unsigned int x)
 {
 	uint32_t buf;
 	unsigned int x_div_PO2, x_mod_PO2;
 	unsigned int zeros;

 	x_div_PO2 = (x * 10486UL) >> 20;
 	x_mod_PO2 = x - PO2 * x_div_PO2;
 	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
 	/*
 	 * Count leading (in memory, trailing in register: we're
 	 * little endian) zero bytes: count leading zero bits and
 	 * round down to 8.
 	 */
 	zeros = __builtin_ctz(buf) & -8U;
 	buf += ZERO_CHARS; /* BCD -> ASCII. */
 	buf >>= zeros; /* Shift away leading zero characters */
 	memcpy(out, &buf, 4);

 	/* zeros is in bits; convert to bytes to find actual length. */
 	return (char *)out + 4 - zeros / 8;
 }

 INLINE void *
 itoa_ten_thousand_pad(void *out, unsigned int x)
 {
 	uint32_t buf;
 	unsigned int x_div_PO2, x_mod_PO2;
 	unsigned int zeros;

 	x_div_PO2 = (x * 10486UL) >> 20;
 	x_mod_PO2 = x - PO2 * x_div_PO2;
 	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
 	buf += ZERO_CHARS; /* BCD -> ASCII. */
 	memcpy(out, &buf, 4);
 	return (char *)out + 4;
 }

 static uint32_t fast04u(unsigned int) __attribute__((noinline));
 static uint32_t an_4u(unsigned int) __attribute__((noinline));
 static uint32_t an_04u(unsigned int) __attribute__((noinline));
 static uint32_t nop(unsigned int) __attribute__((noinline));

 static uint32_t fast04u(unsigned int u)
 {
 	uint32_t ret = _mm_cvtsi128_si32(d4toa(u));

 	asm("# foo" : "+r"(ret) :: "memory");
 	return ret;
 }

 static uint32_t an_4u(unsigned int u)
 {
 	uint32_t buf;

 	itoa_ten_thousand(&buf, u);
 	asm("" : "+r"(buf) :: "memory");
 	return buf;
 }

 static uint32_t an_04u(unsigned int u)
 {
 	uint32_t buf;

 	itoa_ten_thousand_pad(&buf, u);
 	asm("" : "+r"(buf) :: "memory");
 	return buf;
 }

 static uint32_t nop(unsigned int u)
 {

 	asm("" : "+r"(u));
 	return u;
 }

 static void
 print_u32(uint32_t buf)
 {

 	printf("%.4s\n", (char *)&buf);
 	return;
 }

 static int
 cmp_u64(const void *vx, const void *vy)
 {
 	const uint64_t *x = vx;
 	const uint64_t *y = vy;

 	if (*x == *y) {
 		return 0;
 	}

 	return (*x < *y) ? -1 : 1;
 }

 static __attribute__((noinline)) void
 report(uint64_t *times, size_t n)
 {

 	qsort(times, n, sizeof(*times), cmp_u64);
 	printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
 	    times[n / 100], times[n / 10], times[n / 2]);
 	return;
 }

 static inline void
 cpuid(void)
 {
 	uint32_t a, b, c, d;

 	asm("cpuid"
 	    : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
 	    : "0"(0)
 	    : "memory");
 	return;
 }

 #define NRUNS 1000UL
 #define K 100

 int
 main()
 {
 	static uint64_t times[NRUNS];

 #define TEST(X) do {				\
 		print_u32(fast04u((X)));	\
 		print_u32(an_04u((X)));		\
 		print_u32(an_4u((X)));		\
 	} while (0)

 	/* TEST(0); */
 	/* TEST(1); */
 	/* TEST(12); */
 	/* TEST(123); */
 	/* TEST(1234); */
 	/* TEST(1230); */
 	/* TEST(9900); */

 	asm("" ::: "memory");
 #define TEST_SERIAL(FUN) do {					\
 		memset(times, 0, sizeof(times));		\
 								\
 		for (size_t i = 0; i < NRUNS; i++) {		\
 			ticks begin, end;			\
 			uint32_t acc = 0;			\
 								\
 			cpuid();				\
 			begin = getticks();			\
 			for (size_t j = 0; j < K; j++) {	\
 				asm("# foo" ::: "memory");	\
 				acc = FUN (acc % 8192);		\
 			}					\
 								\
 			cpuid();				\
 			end = getticks();			\
 			times[i] = elapsed(end, begin);		\
 		}						\
 								\
 		report(times, NRUNS);				\
 	} while (0)

 	printf("serial\n");
 	TEST_SERIAL(nop);
 	TEST_SERIAL(fast04u);
 	TEST_SERIAL(an_04u);
 	TEST_SERIAL(an_4u);

 #define TEST_THROUGHPUT(FUN) do {				\
 		static uint32_t dst[10 * (K)];			\
 								\
 		memset(times, 0, sizeof(times));		\
 		memset(dst, 0, sizeof(dst));			\
 								\
 		for (size_t i = 0; i < NRUNS; i++) {		\
 			ticks begin, end;			\
 			uint32_t acc = 0;			\
 								\
 			cpuid();				\
 			begin = getticks();			\
 			for (size_t j = 0; j < 10 * (K); j++) {	\
 				dst[j] = FUN (j);		\
 			}					\
 								\
 			cpuid();				\
 			end = getticks();			\
 			times[i] = elapsed(end, begin);		\
 		}						\
 								\
 		report(times, NRUNS);				\
 	} while (0)

 	printf("throughput\n");
 	TEST_THROUGHPUT(nop);
 	TEST_THROUGHPUT(fast04u);
 	TEST_THROUGHPUT(an_04u);
 	TEST_THROUGHPUT(an_4u);

 	return 0;
 }
	#include <assert.h>
	#include <err.h>
	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#include <emmintrin.h>
	#include <tmmintrin.h>

	#include "cycle.h"

	static inline __m128i d4toa(unsigned int u)
	{
	/*
	* Multiply u by -65536 (to move -u to the high word),
	* 26215 (magic for u/10), 10486 (magic for u/100) and
	* 8389 (magic for u/1000).
	*/
	const __m128i first_madd =
	_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);

	/*
	* Zero-out 18 low bits of u26215, 20 low bits of u10486
	* and 23 low bits of u*8389:
	* [-u, 0, u/104, 0, u/10016, 0, u/1000*128].
	*/
	const __m128i mask =
	_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);

	/*
	* Input value
	*
	* [-u, u/104, u/104, u/10016, u/10016, u/1000128, n/1000128, 0]
	*
	* is multiplied to produce 4 scaled digits:
	*
	* [(-u)-256 - (u/104)1064, 0, (u/104)64 - (u/10016)16*10,
	* (u/10016)16 - (u/1000128)210, (n/1000128)*2]
	*/
	const __m128i second_madd =
	_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);

	/*
	* Shuffle digits to low bytes and OR with ascii zeroes.
	* Only low 32-bit word matter, three other words can
	* have any values.
	*/
	const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
	const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);

	__m128i x;

	//assert(u <= 9999);

	x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
	x = _mm_and_si128(x, mask);
	x = _mm_or_si128(x, _mm_slli_si128(x, 2));
	x = _mm_madd_epi16(x, second_madd);
	x = _mm_shuffle_epi8(x, shuffle);
	x = _mm_or_si128(x, ascii_zero);

	return x;
	}

	#define INLINE static inline __attribute__((always_inline))

	typedef __uint128_t uint128_t;

	/* Powers of 10. */
	#define PO2 100ULL
	#define PO4 10000ULL
	#define PO8 100000000ULL
	#define PO10 10000000000ULL
	#define PO16 10000000000000000ULL

	/* 64 bit's worth of '0' characters (i.e., 8 characters). */
	#define ZERO_CHARS 0x3030303030303030ULL

	INLINE uint32_t
	encode_hundreds(uint32_t hi, uint32_t lo)
	{
	/*
	* Pack everything in a single 32 bit value.
	*
	* merged = [ hi 0 lo 0 ]
	*/
	uint32_t merged = hi \| (lo << 16);
	/*
	* Fixed-point multiplication by 103/1024 ~= 1/10.
	*/
	uint32_t tens = (merged * 103UL) >> 10;

	/*
	* Mask away garbage bits between our digits.
	*
	* tens = [ hi/10 0 lo/10 0 ]
	*
	* On a platform with more restricted literals (ARM, for
	* instance), it may make sense to and-not with the middle
	* bits.
	*/
	tens &= (0xFUL << 16) \| 0xFUL;

	/*
	* x mod 10 = x - 10 * (x div 10).
	*
	* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
	*
	* Then insert these values between tens. Arithmetic instead
	* of bitwise operation helps the compiler merge this with
	* later increments by a constant (e.g., ZERO_CHARS).
	*/
	return tens + ((merged - 10UL * tens) << 8);
	}

	INLINE void *
	itoa_ten_thousand(void *out, unsigned int x)
	{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	/*
	* Count leading (in memory, trailing in register: we're
	* little endian) zero bytes: count leading zero bits and
	* round down to 8.
	*/
	zeros = __builtin_ctz(buf) & -8U;
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	buf >>= zeros; /* Shift away leading zero characters */
	memcpy(out, &buf, 4);

	/* zeros is in bits; convert to bytes to find actual length. */
	return (char *)out + 4 - zeros / 8;
	}

	INLINE void *
	itoa_ten_thousand_pad(void *out, unsigned int x)
	{
	uint32_t buf;
	unsigned int x_div_PO2, x_mod_PO2;
	unsigned int zeros;

	x_div_PO2 = (x * 10486UL) >> 20;
	x_mod_PO2 = x - PO2 * x_div_PO2;
	buf = encode_hundreds(x_div_PO2, x_mod_PO2);
	buf += ZERO_CHARS; /* BCD -> ASCII. */
	memcpy(out, &buf, 4);
	return (char *)out + 4;
	}

	static uint32_t fast04u(unsigned int) __attribute__((noinline));
	static uint32_t an_4u(unsigned int) __attribute__((noinline));
	static uint32_t an_04u(unsigned int) __attribute__((noinline));
	static uint32_t nop(unsigned int) __attribute__((noinline));

	static uint32_t fast04u(unsigned int u)
	{
	uint32_t ret = _mm_cvtsi128_si32(d4toa(u));

	asm("# foo" : "+r"(ret) :: "memory");
	return ret;
	}

	static uint32_t an_4u(unsigned int u)
	{
	uint32_t buf;

	itoa_ten_thousand(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
	}

	static uint32_t an_04u(unsigned int u)
	{
	uint32_t buf;

	itoa_ten_thousand_pad(&buf, u);
	asm("" : "+r"(buf) :: "memory");
	return buf;
	}

	static uint32_t nop(unsigned int u)
	{

	asm("" : "+r"(u));
	return u;
	}

	static void
	print_u32(uint32_t buf)
	{

	printf("%.4s\n", (char *)&buf);
	return;
	}

	static int
	cmp_u64(const void vx, const void vy)
	{
	const uint64_t *x = vx;
	const uint64_t *y = vy;

	if (x == y) {
	return 0;
	}

	return (x < y) ? -1 : 1;
	}

	static __attribute__((noinline)) void
	report(uint64_t *times, size_t n)
	{

	qsort(times, n, sizeof(*times), cmp_u64);
	printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
	times[n / 100], times[n / 10], times[n / 2]);
	return;
	}

	static inline void
	cpuid(void)
	{
	uint32_t a, b, c, d;

	asm("cpuid"
	: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
	: "0"(0)
	: "memory");
	return;
	}

	#define NRUNS 1000UL
	#define K 100

	int
	main()
	{
	static uint64_t times[NRUNS];

	#define TEST(X) do { \
	print_u32(fast04u((X))); \
	print_u32(an_04u((X))); \
	print_u32(an_4u((X))); \
	} while (0)

	/* TEST(0); */
	/* TEST(1); */
	/* TEST(12); */
	/* TEST(123); */
	/* TEST(1234); */
	/* TEST(1230); */
	/* TEST(9900); */

	asm("" ::: "memory");
	#define TEST_SERIAL(FUN) do { \
	memset(times, 0, sizeof(times)); \
	\
	for (size_t i = 0; i < NRUNS; i++) { \
	ticks begin, end; \
	uint32_t acc = 0; \
	\
	cpuid(); \
	begin = getticks(); \
	for (size_t j = 0; j < K; j++) { \
	asm("# foo" ::: "memory"); \
	acc = FUN (acc % 8192); \
	} \
	\
	cpuid(); \
	end = getticks(); \
	times[i] = elapsed(end, begin); \
	} \
	\
	report(times, NRUNS); \
	} while (0)

	printf("serial\n");
	TEST_SERIAL(nop);
	TEST_SERIAL(fast04u);
	TEST_SERIAL(an_04u);
	TEST_SERIAL(an_4u);

	#define TEST_THROUGHPUT(FUN) do { \
	static uint32_t dst[10 * (K)]; \
	\
	memset(times, 0, sizeof(times)); \
	memset(dst, 0, sizeof(dst)); \
	\
	for (size_t i = 0; i < NRUNS; i++) { \
	ticks begin, end; \
	uint32_t acc = 0; \
	\
	cpuid(); \
	begin = getticks(); \
	for (size_t j = 0; j < 10 * (K); j++) { \
	dst[j] = FUN (j); \
	} \
	\
	cpuid(); \
	end = getticks(); \
	times[i] = elapsed(end, begin); \
	} \
	\
	report(times, NRUNS); \
	} while (0)

	printf("throughput\n");
	TEST_THROUGHPUT(nop);
	TEST_THROUGHPUT(fast04u);
	TEST_THROUGHPUT(an_04u);
	TEST_THROUGHPUT(an_4u);

	return 0;
	}