Last active
December 16, 2019 09:52
-
-
Save danielealbano/233c5f56dc7d7a315431407d63db24b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## INSTRUCTIONS | |
## | |
## compile using | |
## gcc -O3 -funroll-loops -std=c11 -march=native -mavx2 -msse -msse2 -msse3 -msse4 -o test-ht-cmp-methods.c test-ht-cmp-methods.c | |
## | |
## run with | |
## ./test-ht-cmp-methods.c | |
## | |
## Output (comoare method, key idx, ht size rounded up per cache line, cpu cycles, clock ticks) | |
## simple,7,8,220,2 | |
## simple,15,16,242,3 | |
## simple,999,1000,1587,3 | |
## simple,9999,10000,12524,6 | |
## simple,99999,100000,139767,51 | |
## simple,999999,1000000,1721598,595 | |
## simple,9999999,10000000,18302077,6297 | |
## simple,99999999,100000000,152492410,52420 | |
## sse,7,8,109,1 | |
## sse,15,16,115,1 | |
## sse,999,1000,1331,2 | |
## sse,9999,10000,14477,8 | |
## sse,99999,100000,195084,68 | |
## sse,999999,1000000,2232959,771 | |
## sse,9999999,10000000,23281896,7856 | |
## sse,99999999,100000000,198084171,68021 | |
## avx2,7,8,178,1 | |
## avx2,15,16,183,2 | |
## avx2,999,1000,1732,3 | |
## avx2,9999,10000,15230,7 | |
## avx2,99999,100000,230672,71 | |
## avx2,999999,1000000,1960848,677 | |
## avx2,9999999,10000000,24492360,8331 | |
## avx2,99999999,100000000,166761353,57420 | |
## | |
## CODE BELOW | |
## | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
#include <sys/time.h> | |
#include <sys/resource.h> | |
#include <xmmintrin.h> | |
#include <immintrin.h> | |
volatile uint64_t* ht_keys; | |
static inline unsigned long rdtscp_start(void) | |
{ | |
unsigned long var; | |
unsigned int hi, lo; | |
__asm volatile ("cpuid\n\t" | |
"rdtsc\n\t" : "=a" (lo), "=d" (hi) | |
:: "%rbx", "%rcx"); | |
var = ((unsigned long)hi << 32) | lo; | |
return (var); | |
} | |
static inline unsigned long rdtscp_end(void) | |
{ | |
unsigned long var; | |
unsigned int hi, lo; | |
__asm volatile ("rdtscp\n\t" | |
"mov %%edx, %1\n\t" | |
"mov %%eax, %0\n\t" | |
"cpuid\n\t" : "=r" (lo), "=r" (hi) | |
:: "%rax", "%rbx", "%rcx", "%rdx"); | |
var = ((unsigned long)hi << 32) | lo; | |
return (var); | |
} | |
int ht_search_key_simple(volatile uint64_t* ht_keys, uint64_t ht_size, uint64_t key, uint64_t* bucket_index) | |
{ | |
uint8_t found_in_ht = 0; | |
for(uint64_t i = 0; i < ht_size; i+=4) | |
{ | |
if (ht_keys[i] == key) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i; | |
} | |
else if (ht_keys[i + 1] == key) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 1; | |
} | |
else if (ht_keys[i + 2] == key) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 2; | |
} | |
else if (ht_keys[i + 3] == key) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 3; | |
} | |
if (found_in_ht) | |
{ | |
break; | |
} | |
} | |
return found_in_ht; | |
} | |
int ht_search_key_sse(volatile uint64_t* ht_keys, uint64_t ht_size, uint64_t key, uint64_t* bucket_index) | |
{ | |
uint64_t found_in_ht = 0; | |
uint32_t __attribute__ ((aligned(16))) search_vector[4] = { | |
(uint32_t)((key & 0xFFFFFFFF00000000LL) >> 32), (uint32_t)(key & 0xFFFFFFFFLL), | |
(uint32_t)((key & 0xFFFFFFFF00000000LL) >> 32), (uint32_t)(key & 0xFFFFFFFFLL) | |
}; | |
uint32_t __attribute__ ((aligned(16))) result[4]; | |
__m128i v1 = _mm_load_si128((__m128i *)search_vector); | |
for(uint64_t i = 0; i < ht_size; i+=2) | |
{ | |
__m128i v2 = _mm_load_si128((__m128i *)&ht_keys[i]); | |
__m128i vcmp = _mm_cmpeq_epi32(v1, v2); | |
_mm_store_si128((__m128i *)result, vcmp); | |
if (((uint64_t*)result)[0] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 0; | |
} | |
else if (((uint64_t*)result)[1] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 1; | |
} | |
if (found_in_ht) | |
{ | |
break; | |
} | |
} | |
return found_in_ht; | |
} | |
int ht_search_key_avx2(volatile uint64_t* ht_keys, uint64_t ht_size, uint64_t key, uint64_t* bucket_index) | |
{ | |
uint64_t found_in_ht = 0; | |
uint64_t __attribute__ ((aligned(16))) search_vector[4] = { key, key, key, key }; | |
uint64_t __attribute__ ((aligned(16))) result[4]; | |
__m256i v1 = _mm256_loadu_si256((__m256i *)search_vector); | |
for(uint64_t i = 0; i < ht_size; i+=4) | |
{ | |
__m256i v2 = _mm256_loadu_si256((__m256i *)&ht_keys[i]); | |
__m256i vcmp = _mm256_cmpeq_epi64(v1, v2); | |
_mm256_store_si256((__m256i *)result, vcmp); | |
if (result[0] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 0; | |
break; | |
} | |
else if (result[1] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 1; | |
break; | |
} | |
else if (result[2] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 2; | |
break; | |
} | |
else if (result[3] == 0xFFFFFFFFFFFFFFFF) | |
{ | |
found_in_ht = 1; | |
*bucket_index = i + 3; | |
break; | |
} | |
} | |
return found_in_ht; | |
} | |
typedef int (*ht_search_key_fn_t)(volatile uint64_t*, uint64_t, uint64_t, uint64_t*); | |
typedef struct { | |
char* name; | |
ht_search_key_fn_t fn; | |
} compare_method_t; | |
int main(int argc, char** argv) | |
{ | |
compare_method_t compare_methods[] = { | |
{ | |
.name = "simple", | |
.fn = &ht_search_key_simple | |
}, | |
{ | |
.name = "sse", | |
.fn = &ht_search_key_sse | |
}, | |
{ | |
.name = "avx2", | |
.fn = &ht_search_key_avx2 | |
}, | |
{ | |
.name = NULL, | |
.fn = NULL | |
} | |
}; | |
uint64_t ht_sizes[] = {1, 9, 999, 9999, 99999, 999999, 9999999, 99999999}; | |
compare_method_t* compare_method = &compare_methods[0]; | |
while(compare_method->name != NULL) | |
{ | |
for(uint8_t ht_sizes_idx = 0; ht_sizes_idx < sizeof(ht_sizes) / sizeof(uint64_t); ht_sizes_idx++) | |
{ | |
uint64_t ht_size = ht_sizes[ht_sizes_idx]; | |
// Ensure that ht_size is big N times a cacheline | |
ht_size = ht_size + ((64 - ((ht_size * sizeof(uint64_t)) % 64)) / sizeof(uint64_t)); | |
uint64_t key = ht_size - 1; | |
// Allocate memory | |
ht_keys = _mm_malloc(ht_size * sizeof(uint64_t), 64 * 8); | |
if (ht_keys == NULL) | |
{ | |
perror("Unable to allocate memory"); | |
return -1; | |
} | |
// fill the ht_keys | |
for(uint64_t i = 0; i < ht_size; i++) | |
{ | |
ht_keys[i] = i; | |
} | |
//struct timespec start, end; | |
clock_t tick, tock; | |
uint64_t total_cpu_cycles; | |
uint64_t found_in_ht_bucket_index; | |
printf("%s,%lld,%lld,", compare_method->name, key, ht_size); | |
for(uint8_t loop = 0; loop < 5; loop++) | |
{ | |
tick = clock(); | |
total_cpu_cycles = rdtscp_start(); | |
(*compare_method->fn)(ht_keys, ht_size, key, &found_in_ht_bucket_index); | |
total_cpu_cycles = rdtscp_end() - total_cpu_cycles; | |
tock = clock(); | |
} | |
clock_t ticks = tock - tick; | |
//double dt = (double)ticks; | |
printf("%lld,", total_cpu_cycles); | |
printf("%lu\n", ticks); | |
_mm_free((void*)ht_keys); | |
} | |
compare_method++; | |
} | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment