Skip to content

Instantly share code, notes, and snippets.

@alnsn
Forked from pkhuong/u4.c
Created January 4, 2018 00:44
Show Gist options
  • Save alnsn/615162fcb42b4b2a2261c03654a966ab to your computer and use it in GitHub Desktop.
Save alnsn/615162fcb42b4b2a2261c03654a966ab to your computer and use it in GitHub Desktop.
#include <assert.h>
#include <err.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "cycle.h"
static inline __m128i d4toa(unsigned int u)
{
/*
* Multiply u by -65536 (to move -u to the high word),
* 26215 (magic for u/10), 10486 (magic for u/100) and
* 8389 (magic for u/1000).
*/
const __m128i first_madd =
_mm_set_epi16(-32768, -32768, 0, 26215, 0, 10486, 0, 8389);
/*
* Zero-out 18 low bits of u*26215, 20 low bits of u*10486
* and 23 low bits of u*8389:
* [-u, 0, u/10*4, 0, u/100*16, 0, u/1000*128].
*/
const __m128i mask =
_mm_set_epi16(0xffff, 0, 0xfffc, 0, 0xfff0, 0, 0xff80, 0);
/*
* Input value
*
* [-u, u/10*4, u/10*4, u/100*16, u/100*16, u/1000*128, n/1000*128, 0]
*
* is multiplied to produce 4 scaled digits:
*
* [(-u)*-256 - (u/10*4)*10*64, 0, (u/10*4)*64 - (u/100*16)*16*10,
* (u/100*16)*16 - (u/1000*128)*2*10, (n/1000*128)*2]
*/
const __m128i second_madd =
_mm_set_epi16(-256, -640, 64, -160, 16, -20, 2, 0);
/*
* Shuffle digits to low bytes and OR with ascii zeroes.
* Only low 32-bit word matter, three other words can
* have any values.
*/
const __m128i shuffle = _mm_set_epi32(0, 0, 0, 0x0d090501);
const __m128i ascii_zero = _mm_set_epi32(0, 0, 0, 0x30303030);
__m128i x;
//assert(u <= 9999);
x = _mm_madd_epi16(_mm_set1_epi16(u), first_madd);
x = _mm_and_si128(x, mask);
x = _mm_or_si128(x, _mm_slli_si128(x, 2));
x = _mm_madd_epi16(x, second_madd);
x = _mm_shuffle_epi8(x, shuffle);
x = _mm_or_si128(x, ascii_zero);
return x;
}
#define INLINE static inline __attribute__((always_inline))
typedef __uint128_t uint128_t;
/* Powers of 10. */
#define PO2 100ULL
#define PO4 10000ULL
#define PO8 100000000ULL
#define PO10 10000000000ULL
#define PO16 10000000000000000ULL
/* 64 bit's worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL
INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
/*
* Pack everything in a single 32 bit value.
*
* merged = [ hi 0 lo 0 ]
*/
uint32_t merged = hi | (lo << 16);
/*
* Fixed-point multiplication by 103/1024 ~= 1/10.
*/
uint32_t tens = (merged * 103UL) >> 10;
/*
* Mask away garbage bits between our digits.
*
* tens = [ hi/10 0 lo/10 0 ]
*
* On a platform with more restricted literals (ARM, for
* instance), it may make sense to and-not with the middle
* bits.
*/
tens &= (0xFUL << 16) | 0xFUL;
/*
* x mod 10 = x - 10 * (x div 10).
*
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
*
* Then insert these values between tens. Arithmetic instead
* of bitwise operation helps the compiler merge this with
* later increments by a constant (e.g., ZERO_CHARS).
*/
return tens + ((merged - 10UL * tens) << 8);
}
INLINE void *
itoa_ten_thousand(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
/*
* Count leading (in memory, trailing in register: we're
* little endian) zero bytes: count leading zero bits and
* round down to 8.
*/
zeros = __builtin_ctz(buf) & -8U;
buf += ZERO_CHARS; /* BCD -> ASCII. */
buf >>= zeros; /* Shift away leading zero characters */
memcpy(out, &buf, 4);
/* zeros is in bits; convert to bytes to find actual length. */
return (char *)out + 4 - zeros / 8;
}
INLINE void *
itoa_ten_thousand_pad(void *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
buf += ZERO_CHARS; /* BCD -> ASCII. */
memcpy(out, &buf, 4);
return (char *)out + 4;
}
static uint32_t fast04u(unsigned int) __attribute__((noinline));
static uint32_t an_4u(unsigned int) __attribute__((noinline));
static uint32_t an_04u(unsigned int) __attribute__((noinline));
static uint32_t nop(unsigned int) __attribute__((noinline));
static uint32_t fast04u(unsigned int u)
{
uint32_t ret = _mm_cvtsi128_si32(d4toa(u));
asm("# foo" : "+r"(ret) :: "memory");
return ret;
}
static uint32_t an_4u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t an_04u(unsigned int u)
{
uint32_t buf;
itoa_ten_thousand_pad(&buf, u);
asm("" : "+r"(buf) :: "memory");
return buf;
}
static uint32_t nop(unsigned int u)
{
asm("" : "+r"(u));
return u;
}
static void
print_u32(uint32_t buf)
{
printf("%.4s\n", (char *)&buf);
return;
}
static int
cmp_u64(const void *vx, const void *vy)
{
const uint64_t *x = vx;
const uint64_t *y = vy;
if (*x == *y) {
return 0;
}
return (*x < *y) ? -1 : 1;
}
static __attribute__((noinline)) void
report(uint64_t *times, size_t n)
{
qsort(times, n, sizeof(*times), cmp_u64);
printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
times[n / 100], times[n / 10], times[n / 2]);
return;
}
static inline void
cpuid(void)
{
uint32_t a, b, c, d;
asm("cpuid"
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
: "0"(0)
: "memory");
return;
}
#define NRUNS 1000UL
#define K 100
int
main()
{
static uint64_t times[NRUNS];
#define TEST(X) do { \
print_u32(fast04u((X))); \
print_u32(an_04u((X))); \
print_u32(an_4u((X))); \
} while (0)
/* TEST(0); */
/* TEST(1); */
/* TEST(12); */
/* TEST(123); */
/* TEST(1234); */
/* TEST(1230); */
/* TEST(9900); */
asm("" ::: "memory");
#define TEST_SERIAL(FUN) do { \
memset(times, 0, sizeof(times)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < K; j++) { \
asm("# foo" ::: "memory"); \
acc = FUN (acc % 8192); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("serial\n");
TEST_SERIAL(nop);
TEST_SERIAL(fast04u);
TEST_SERIAL(an_04u);
TEST_SERIAL(an_4u);
#define TEST_THROUGHPUT(FUN) do { \
static uint32_t dst[10 * (K)]; \
\
memset(times, 0, sizeof(times)); \
memset(dst, 0, sizeof(dst)); \
\
for (size_t i = 0; i < NRUNS; i++) { \
ticks begin, end; \
uint32_t acc = 0; \
\
cpuid(); \
begin = getticks(); \
for (size_t j = 0; j < 10 * (K); j++) { \
dst[j] = FUN (j); \
} \
\
cpuid(); \
end = getticks(); \
times[i] = elapsed(end, begin); \
} \
\
report(times, NRUNS); \
} while (0)
printf("throughput\n");
TEST_THROUGHPUT(nop);
TEST_THROUGHPUT(fast04u);
TEST_THROUGHPUT(an_04u);
TEST_THROUGHPUT(an_4u);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment