-
-
Save alnsn/615162fcb42b4b2a2261c03654a966ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <err.h> | |
#include <inttypes.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <emmintrin.h> | |
#include <tmmintrin.h> | |
#include "cycle.h" | |
static inline __m128i d4toa(unsigned int u) | |
{ | |
/* | |
* Multiply u by -65536 (to move -u to the high word), | |
* 26215 (magic for u/10), 10486 (magic for u/100) and | |
* 8389 (magic for u/1000). | |
*/ | |
const __m128i first_madd = | |
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389); | |
/* | |
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486 | |
* and 23 low bits of u*8389: | |
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128]. | |
*/ | |
const __m128i mask = | |
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0); | |
/* | |
* Input value | |
* | |
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0] | |
* | |
* is multiplied to produce 4 scaled digits: | |
* | |
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10, | |
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2] | |
*/ | |
const __m128i second_madd = | |
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0); | |
/* | |
* Shuffle digits to low bytes and OR with ascii zeroes. | |
* Only low 32-bit word matter, three other words can | |
* have any values. | |
*/ | |
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501); | |
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030); | |
__m128i x; | |
//assert(u <= 9999); | |
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd); | |
x = _mm_and_si128(x, mask); | |
x = _mm_or_si128(x, _mm_slli_si128(x, 2)); | |
x = _mm_madd_epi16(x, second_madd); | |
x = _mm_shuffle_epi8(x, shuffle); | |
x = _mm_or_si128(x, ascii_zero); | |
return x; | |
} | |
#define INLINE static inline __attribute__((always_inline)) | |
typedef __uint128_t uint128_t; | |
/* Powers of 10. */ | |
#define PO2 100ULL | |
#define PO4 10000ULL | |
#define PO8 100000000ULL | |
#define PO10 10000000000ULL | |
#define PO16 10000000000000000ULL | |
/* 64 bit's worth of '0' characters (i.e., 8 characters). */ | |
#define ZERO_CHARS 0x3030303030303030ULL | |
INLINE uint32_t | |
encode_hundreds(uint32_t hi, uint32_t lo) | |
{ | |
/* | |
* Pack everything in a single 32 bit value. | |
* | |
* merged = [ hi 0 lo 0 ] | |
*/ | |
uint32_t merged = hi | (lo << 16); | |
/* | |
* Fixed-point multiplication by 103/1024 ~= 1/10. | |
*/ | |
uint32_t tens = (merged * 103UL) >> 10; | |
/* | |
* Mask away garbage bits between our digits. | |
* | |
* tens = [ hi/10 0 lo/10 0 ] | |
* | |
* On a platform with more restricted literals (ARM, for | |
* instance), it may make sense to and-not with the middle | |
* bits. | |
*/ | |
tens &= (0xFUL << 16) | 0xFUL; | |
/* | |
* x mod 10 = x - 10 * (x div 10). | |
* | |
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ] | |
* | |
* Then insert these values between tens. Arithmetic instead | |
* of bitwise operation helps the compiler merge this with | |
* later increments by a constant (e.g., ZERO_CHARS). | |
*/ | |
return tens + ((merged - 10UL * tens) << 8); | |
} | |
INLINE void * | |
itoa_ten_thousand(void *out, unsigned int x) | |
{ | |
uint32_t buf; | |
unsigned int x_div_PO2, x_mod_PO2; | |
unsigned int zeros; | |
x_div_PO2 = (x * 10486UL) >> 20; | |
x_mod_PO2 = x - PO2 * x_div_PO2; | |
buf = encode_hundreds(x_div_PO2, x_mod_PO2); | |
/* | |
* Count leading (in memory, trailing in register: we're | |
* little endian) zero bytes: count leading zero bits and | |
* round down to 8. | |
*/ | |
zeros = __builtin_ctz(buf) & -8U; | |
buf += ZERO_CHARS; /* BCD -> ASCII. */ | |
buf >>= zeros; /* Shift away leading zero characters */ | |
memcpy(out, &buf, 4); | |
/* zeros is in bits; convert to bytes to find actual length. */ | |
return (char *)out + 4 - zeros / 8; | |
} | |
INLINE void * | |
itoa_ten_thousand_pad(void *out, unsigned int x) | |
{ | |
uint32_t buf; | |
unsigned int x_div_PO2, x_mod_PO2; | |
unsigned int zeros; | |
x_div_PO2 = (x * 10486UL) >> 20; | |
x_mod_PO2 = x - PO2 * x_div_PO2; | |
buf = encode_hundreds(x_div_PO2, x_mod_PO2); | |
buf += ZERO_CHARS; /* BCD -> ASCII. */ | |
memcpy(out, &buf, 4); | |
return (char *)out + 4; | |
} | |
static uint32_t fast04u(unsigned int) __attribute__((noinline)); | |
static uint32_t an_4u(unsigned int) __attribute__((noinline)); | |
static uint32_t an_04u(unsigned int) __attribute__((noinline)); | |
static uint32_t nop(unsigned int) __attribute__((noinline)); | |
static uint32_t fast04u(unsigned int u) | |
{ | |
uint32_t ret = _mm_cvtsi128_si32(d4toa(u)); | |
asm("# foo" : "+r"(ret) :: "memory"); | |
return ret; | |
} | |
static uint32_t an_4u(unsigned int u) | |
{ | |
uint32_t buf; | |
itoa_ten_thousand(&buf, u); | |
asm("" : "+r"(buf) :: "memory"); | |
return buf; | |
} | |
static uint32_t an_04u(unsigned int u) | |
{ | |
uint32_t buf; | |
itoa_ten_thousand_pad(&buf, u); | |
asm("" : "+r"(buf) :: "memory"); | |
return buf; | |
} | |
static uint32_t nop(unsigned int u) | |
{ | |
asm("" : "+r"(u)); | |
return u; | |
} | |
static void | |
print_u32(uint32_t buf) | |
{ | |
printf("%.4s\n", (char *)&buf); | |
return; | |
} | |
static int | |
cmp_u64(const void *vx, const void *vy) | |
{ | |
const uint64_t *x = vx; | |
const uint64_t *y = vy; | |
if (*x == *y) { | |
return 0; | |
} | |
return (*x < *y) ? -1 : 1; | |
} | |
static __attribute__((noinline)) void | |
report(uint64_t *times, size_t n) | |
{ | |
qsort(times, n, sizeof(*times), cmp_u64); | |
printf("%"PRIu64" %"PRIu64" %"PRIu64"\n", | |
times[n / 100], times[n / 10], times[n / 2]); | |
return; | |
} | |
static inline void | |
cpuid(void) | |
{ | |
uint32_t a, b, c, d; | |
asm("cpuid" | |
: "=a"(a), "=b"(b), "=c"(c), "=d"(d) | |
: "0"(0) | |
: "memory"); | |
return; | |
} | |
#define NRUNS 1000UL | |
#define K 100 | |
int | |
main() | |
{ | |
static uint64_t times[NRUNS]; | |
#define TEST(X) do { \ | |
print_u32(fast04u((X))); \ | |
print_u32(an_04u((X))); \ | |
print_u32(an_4u((X))); \ | |
} while (0) | |
/* TEST(0); */ | |
/* TEST(1); */ | |
/* TEST(12); */ | |
/* TEST(123); */ | |
/* TEST(1234); */ | |
/* TEST(1230); */ | |
/* TEST(9900); */ | |
asm("" ::: "memory"); | |
#define TEST_SERIAL(FUN) do { \ | |
memset(times, 0, sizeof(times)); \ | |
\ | |
for (size_t i = 0; i < NRUNS; i++) { \ | |
ticks begin, end; \ | |
uint32_t acc = 0; \ | |
\ | |
cpuid(); \ | |
begin = getticks(); \ | |
for (size_t j = 0; j < K; j++) { \ | |
asm("# foo" ::: "memory"); \ | |
acc = FUN (acc % 8192); \ | |
} \ | |
\ | |
cpuid(); \ | |
end = getticks(); \ | |
times[i] = elapsed(end, begin); \ | |
} \ | |
\ | |
report(times, NRUNS); \ | |
} while (0) | |
printf("serial\n"); | |
TEST_SERIAL(nop); | |
TEST_SERIAL(fast04u); | |
TEST_SERIAL(an_04u); | |
TEST_SERIAL(an_4u); | |
#define TEST_THROUGHPUT(FUN) do { \ | |
static uint32_t dst[10 * (K)]; \ | |
\ | |
memset(times, 0, sizeof(times)); \ | |
memset(dst, 0, sizeof(dst)); \ | |
\ | |
for (size_t i = 0; i < NRUNS; i++) { \ | |
ticks begin, end; \ | |
uint32_t acc = 0; \ | |
\ | |
cpuid(); \ | |
begin = getticks(); \ | |
for (size_t j = 0; j < 10 * (K); j++) { \ | |
dst[j] = FUN (j); \ | |
} \ | |
\ | |
cpuid(); \ | |
end = getticks(); \ | |
times[i] = elapsed(end, begin); \ | |
} \ | |
\ | |
report(times, NRUNS); \ | |
} while (0) | |
printf("throughput\n"); | |
TEST_THROUGHPUT(nop); | |
TEST_THROUGHPUT(fast04u); | |
TEST_THROUGHPUT(an_04u); | |
TEST_THROUGHPUT(an_4u); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment