Last active
June 5, 2018 16:31
-
-
Save tpruzina/440a60c1ae910e0c7be850bbb6d4d766 to your computer and use it in GitHub Desktop.
You just can't beat the compiler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <stddef.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
// play with these | |
#define EXPECT_SIZE 4096 | |
//#define ALIGN | |
//#define FUNROLL | |
//#define RESTRICT | |
#if !defined(NAIVE) && !defined(INTMAXCPY) && !defined(SOCPY) | |
#define MAIN_TEST_ALL | |
#define NAIVECPY | |
#define INTMAXCPY | |
#define SOCPY | |
#define SSECPY | |
#endif | |
// preprocessor directives | |
#define EXPECT(expr, result) __builtin_expect((expr), (result)) | |
#define LIKELY(condition) __builtin_expect(!!(condition), 1) | |
#define UNLIKELY(condition) __builtin_expect((condition), 0) | |
#define INLINE __inline__ __attribute__((always_inline)) | |
// define this to work with aligned alloc | |
#ifdef ALIGN | |
#define ALIGN_BYTES 8 | |
#define ALIGNED __attribute__((aligned(ALIGN_BYTES))) | |
#else | |
#define ALIGNED | |
#endif | |
// tell the compiler to unroll loops | |
#ifdef FUNROLL | |
#define UNROLL_ENABLED __attribute__((optimize("unroll-loops"))) | |
#else | |
#define UNROLL_ENABLED | |
#endif | |
// tell compiler about aliasing (see "C restrict" keyword on wiki) | |
#ifdef RESTRICT | |
#define RESTRICTED __restrict__ | |
#else | |
#define RESTRICTED | |
#endif | |
#ifdef NAIVECPY | |
UNROLL_ENABLED void | |
naive_cpy(uint8_t * RESTRICTED ALIGNED src, | |
uint8_t * RESTRICTED ALIGNED dst, size_t size) | |
{ | |
size_t i = EXPECT(size, EXPECT_SIZE); | |
for (; UNLIKELY(i); i--) | |
*dst++ = *src++; | |
} | |
#endif | |
#ifdef INTMAXCPY | |
UNROLL_ENABLED void | |
intmax_cpy(uint8_t * RESTRICTED ALIGNED src, | |
uint8_t * RESTRICTED ALIGNED dst, size_t size) | |
{ | |
uintmax_t *si = (uintmax_t *) src; | |
uintmax_t *di = (uintmax_t *) dst; | |
uint8_t *end = src + size; | |
size_t i = EXPECT(size, EXPECT_SIZE) / sizeof(uintmax_t); | |
for (; i > 0; i--) | |
*di++ = *si++; | |
dst = (typeof(dst)) di; | |
src = (typeof(dst)) si; | |
while (src < end) | |
*dst++ = *src++; | |
} | |
#endif | |
#ifdef SOCPY | |
typedef struct { | |
uint8_t dummy[32]; | |
} DT; | |
void SO_cpy(uint8_t * dst, uint8_t * src, size_t s) | |
{ | |
// okay this marks the end of src | |
uint8_t *end = src + s; | |
// this is pure autism | |
// sizeof(DT*) == 8/4B (32/64bit), effectively we loaded (void*)(src_ptr - 8) | |
DT *d1 = (DT *) dst - 1; | |
DT *s1 = (DT *) src - 1; | |
// divide size by copy window | |
size_t si = s / sizeof(DT); | |
si = (si + 7) / 8; | |
switch (si % 8) { | |
case 0: | |
do { | |
*++d1 = *++s1; | |
case 7: | |
*++d1 = *++s1; | |
case 6: | |
*++d1 = *++s1; | |
case 5: | |
*++d1 = *++s1; | |
case 4: | |
*++d1 = *++s1; | |
case 3: | |
*++d1 = *++s1; | |
case 2: | |
*++d1 = *++s1; | |
case 1: | |
*++d1 = *++s1; | |
} | |
while (--si > 0); | |
} | |
dst = (uint8_t *) d1; | |
src = (uint8_t *) s1; | |
while (src < end) { | |
*++dst = *++src; | |
} | |
} | |
#endif | |
#ifdef SSECPY | |
#include <emmintrin.h> | |
static INLINE void sse_cpy(uint8_t * d, uint8_t * s, size_t size) | |
{ | |
size_t pad; | |
void *dst = d; | |
const void *src = s; | |
// this might not matter for powers of two, but it was | |
// a bug in testing random buffer sizes. TODO: think about this. | |
pad = (16 - (((size_t) dst) & 15)) & 15; | |
if (pad > 0) { | |
// TODO: I am not sure this is bug free... or the right way to do this. | |
__m128i top = _mm_loadu_si128((const __m128i *) src); | |
_mm_storeu_si128((__m128i *) dst, top); | |
src += pad; | |
dst += pad; | |
size -= pad; | |
} | |
_mm_prefetch((const char *) (src), _MM_HINT_NTA); | |
for (; size >= 128; size -= 128) { | |
__m128i w0, w1, w2, w3, w4, w5, w6, w7; | |
w0 = _mm_load_si128(((const __m128i *) src) + 0); | |
w1 = _mm_load_si128(((const __m128i *) src) + 1); | |
w2 = _mm_load_si128(((const __m128i *) src) + 2); | |
w3 = _mm_load_si128(((const __m128i *) src) + 3); | |
w4 = _mm_load_si128(((const __m128i *) src) + 4); | |
w5 = _mm_load_si128(((const __m128i *) src) + 5); | |
w6 = _mm_load_si128(((const __m128i *) src) + 6); | |
w7 = _mm_load_si128(((const __m128i *) src) + 7); | |
_mm_prefetch((const char *) (src + 256), _MM_HINT_NTA); | |
src += 128; | |
_mm_stream_si128((((__m128i *) dst) + 0), w0); | |
_mm_stream_si128((((__m128i *) dst) + 1), w1); | |
_mm_stream_si128((((__m128i *) dst) + 2), w2); | |
_mm_stream_si128((((__m128i *) dst) + 3), w3); | |
_mm_stream_si128((((__m128i *) dst) + 4), w4); | |
_mm_stream_si128((((__m128i *) dst) + 5), w5); | |
_mm_stream_si128((((__m128i *) dst) + 6), w6); | |
_mm_stream_si128((((__m128i *) dst) + 7), w7); | |
dst += 128; | |
} | |
// TODO: Last 128 bytes matter when not aligned? | |
// Docs said we need this? | |
_mm_sfence(); | |
} | |
#endif // SSECPY | |
typedef uint64_t ts; | |
static __inline__ uint64_t rdtsc(void) | |
{ | |
uint32_t a, d; | |
__asm volatile ("rdtsc":"=a" (a), "=d"(d)); | |
return ((uint64_t) a) | (((uint64_t) d) << 32); | |
} | |
#ifdef MAIN_TEST_ALL | |
// reset, copy in loop, compare last result and print out time it took to copy data | |
void | |
time_cpy(typeof(naive_cpy) cpy, uint8_t * dst, uint8_t * src, size_t size, | |
size_t loops, const char *msg) | |
{ | |
memset(src, 0, size); | |
memset(dst, 0xFF, size); | |
ts then = rdtsc(); | |
for (uint_fast32_t i = 0; i < loops; i++) | |
cpy(dst, src, size); | |
ts now = rdtsc(); | |
if (memcmp(src, dst, size)) | |
fprintf(stderr, "we dun goofed soomewhere\n"); | |
else | |
printf("%12d clocks [%s]\n", (int) (now - then), msg); | |
} | |
int main(int argc, char **argv) | |
{ | |
// These actually produce different code based on whether size is known at compile time | |
// size_t size = 4096; | |
// size_t size = atoi(argv[1]); | |
size_t size, loops; | |
if (argc == 2) { | |
size = atoi(argv[1]); | |
loops = 1000; | |
} else if (argc == 3) { | |
size = atoi(argv[1]); | |
loops = atoi(argv[2]); | |
} else { | |
size = 4096; | |
loops = 1000; | |
} | |
#ifdef ALIGN | |
uint8_t *src = (uint8_t *) aligned_alloc(ALIGN_BYTES, size); | |
uint8_t *dst = (uint8_t *) aligned_alloc(ALIGN_BYTES, size); | |
#else | |
uint8_t *src = (uint8_t *) malloc(size); | |
uint8_t *dst = (uint8_t *) malloc(size); | |
#endif | |
#ifdef ALIGN | |
printf("[aligned]"); | |
#endif | |
#ifdef RESTRICT | |
printf("[restrict]"); | |
#endif | |
#ifdef FUNROLL | |
printf("[funroll attr]"); | |
#endif | |
printf(" size=%u loops=%u\n", size, loops); | |
time_cpy(naive_cpy, dst, src, size, loops, "NAIVE"); | |
time_cpy(intmax_cpy, dst, src, size, loops, "INTMAX"); | |
time_cpy(SO_cpy, dst, src, size, loops, "SOCPY"); | |
time_cpy(sse_cpy, dst, src, size, loops, "SSE"); | |
free(src); | |
free(dst); | |
return 0; | |
} | |
#endif //MAIN_TEST_ALL |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment