Last active
August 29, 2015 14:06
-
-
Save simonwagner/a6b4a8f9d2eadc4b89df to your computer and use it in GitHub Desktop.
UTF32 compare benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Compile with gcc, because clang does not support alignment of stack values (and does not even warn about that). | |
g++-mp-4.8 -march=native -mtune=native -std=c++11 benchmark.cpp -O3 -o benchmark | |
Results on my computer: | |
naive: 22.6027 (1) | |
hadd: 10.0168 (0.443169) | |
hadd_2: 9.88255 (0.437228) | |
movemask: 9.85698 (0.436097) | |
*/ | |
#include <iostream> | |
#include <algorithm> | |
#include <memory> | |
#include <functional> | |
#include <cassert> | |
#include <string> | |
#include <chrono> | |
#include <fstream> | |
#include <emmintrin.h> | |
#include <xmmintrin.h> | |
#include <tmmintrin.h> | |
using namespace std; | |
#define assert_aligned(x) assert((uintptr_t)(x) % 16 == 0) | |
#define RUNS (1000) | |
#define SIZE (1024*1024*8) | |
typedef uint32_t aligned_uint32_t __attribute__ ((aligned (16))); | |
typedef uint8_t aligned_uint8_t __attribute__ ((aligned (16))); | |
size_t inline constexpr aligned_length(size_t length, int alignement) { | |
return length - (length % alignement); | |
} | |
size_t inline constexpr padded_length(size_t length, int multiple) { | |
return length/multiple + (length % multiple != 0 ? 1 : 0); | |
} | |
void eq_naive(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) { | |
size_t i; | |
const size_t result_length = padded_length(length, 8); | |
for(i = 0; i < length; ++i) { | |
uint8_t cmp = (a[i] == b[i] ? 1 : 0); | |
result[i/8] |= cmp << (i % 8); | |
} | |
if(length % 8 > 0) { | |
result[length - 1] >>= 8 - (length % 8); | |
} | |
} | |
__m128i inline vector_cmp(const aligned_uint32_t* a, const aligned_uint32_t* b) { | |
__m128i mmx_mask = _mm_set_epi32(8,4,2,1); | |
__m128i mmx_a = _mm_load_si128((__m128i*)a); | |
__m128i mmx_b = _mm_load_si128((__m128i*)b); | |
__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b); | |
__m128i mmx_and = _mm_and_si128(mmx_cmp, mmx_mask); | |
return mmx_and; | |
} | |
void eq_hadd(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) { | |
assert_aligned(a); | |
assert_aligned(b); | |
assert_aligned(result); | |
size_t i; | |
const size_t result_length = padded_length(length, 8); | |
__m128i mmx_zero = _mm_setzero_si128(); | |
for(i = 0; i < aligned_length(length, 8); i+=8) { | |
__m128i mmx_temp_result[2]; | |
for(int mi = 0; mi < 2; ++mi) { | |
__m128i mmx_and = vector_cmp(&a[i+4*mi], &b[i+4*mi]); | |
__m128i mmx_hadd1 = _mm_hadd_epi32(mmx_and, mmx_zero); | |
mmx_temp_result[mi] = _mm_hadd_epi32(mmx_hadd1, mmx_zero); | |
} | |
__m128i shifted_mmx_temp_result = _mm_slli_epi64(mmx_temp_result[1], 4); | |
__m128i mmx_result = _mm_or_si128(mmx_temp_result[0], shifted_mmx_temp_result); | |
result[i/8] = _mm_extract_epi8(mmx_result, 0); | |
} | |
for(; i < length; ++i) { | |
uint8_t cmp = (a[i] == b[i] ? 1 : 0); | |
result[i/8] |= cmp << (i % 8); | |
} | |
if(length % 8 > 0) { | |
result[length - 1] >>= 8 - (length % 8); | |
} | |
} | |
void eq_hadd_2(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) { | |
assert_aligned(a); | |
assert_aligned(b); | |
assert_aligned(result); | |
size_t i; | |
const size_t result_length = padded_length(length, 8); | |
__m128i mmx_mult_shift = _mm_set_epi32(16,1,16,1); | |
uint16_t* result_16bit_block = (uint16_t*)result; //we will write to the result in 16bit blocks | |
for(i = 0; i < aligned_length(length, 16); i+=16) { | |
__m128i mmx_a = vector_cmp(&a[i + 4*0], &b[i + 4*0]); | |
__m128i mmx_b = vector_cmp(&a[i + 4*1], &b[i + 4*1]); | |
__m128i mmx_c = vector_cmp(&a[i + 4*2], &b[i + 4*2]); | |
__m128i mmx_d = vector_cmp(&a[i + 4*3], &b[i + 4*3]); | |
__m128i mmx_x = _mm_hadd_epi32(mmx_a,mmx_b); | |
__m128i mmx_y = _mm_hadd_epi32(mmx_c,mmx_d); | |
__m128i mmx_z = _mm_hadd_epi32(mmx_x,mmx_y); | |
__m128i mmx_z_shifted = _mm_mullo_epi32(mmx_z, mmx_mult_shift); | |
__m128i mmx_result = _mm_hadd_epi32(mmx_z_shifted, mmx_z_shifted /*this argument does not matter*/); | |
/* | |
result[i/16 + 0] = _mm_extract_epi8(mmx_result, 0); | |
result[i/16 + 1] = _mm_extract_epi8(mmx_result, 4); | |
*/ | |
//instead of writing the final values twice to memory, shuffle the result | |
//and write it into memory at once | |
__m128i mmx_shuffle_mask = _mm_set_epi32(0x80808080,0x80808080,0x80808080,0x80800400); /*4->1 and 0->0 */ | |
__m128i mmx_shuffled_result = _mm_shuffle_epi8(mmx_result, mmx_shuffle_mask); | |
result_16bit_block[i/16] = _mm_extract_epi16(mmx_shuffled_result, 0); | |
} | |
for(; i < length; ++i) { | |
uint8_t cmp = (a[i] == b[i] ? 1 : 0); | |
result[i/8] |= cmp << (i % 8); | |
} | |
if(length % 8 > 0) { | |
result[length - 1] >>= 8 - (length % 8); | |
} | |
} | |
void eq_movemask(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) { | |
assert_aligned(a); | |
assert_aligned(b); | |
assert_aligned(result); | |
size_t i; | |
const size_t result_length = padded_length(length, 8); | |
for(i = 0; i < aligned_length(length, 8); i+=8) { | |
uint16_t tmp_result[2]; | |
for(int mi = 0; mi < 2; ++mi) { | |
__m128i mmx_a = _mm_load_si128((__m128i*)&a[i+4*mi]); | |
__m128i mmx_b = _mm_load_si128((__m128i*)&b[i+4*mi]); | |
__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b); | |
auto tmp1 = _mm_packs_epi32(mmx_cmp, mmx_cmp); | |
auto tmp2 = _mm_packs_epi32(tmp1, tmp1); | |
tmp_result[mi] = _mm_movemask_epi8(tmp2) & 0x0F; | |
} | |
result[i/8] = (tmp_result[1] << 4) | (tmp_result[0]); | |
} | |
for(; i < length; ++i) { | |
uint8_t cmp = (a[i] == b[i] ? 1 : 0); | |
result[i/8] |= cmp << (i % 8); | |
} | |
if(length % 8 > 0) { | |
result[length - 1] >>= 8 - (length % 8); | |
} | |
} | |
template<typename ArrayType, typename ResultType> | |
bool exec_test(function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, const ArrayType* a, const ArrayType* b, size_t input_length, const ResultType* exp, size_t result_length, function<ResultType*(size_t)> result_alloc, function<void(ResultType*)> result_dealloc) { | |
ResultType* result = result_alloc(result_length); | |
tested_func(a, b, result, input_length); | |
bool passed = true; | |
for(int i = 0; i < result_length; ++i) { | |
if(!(exp[i] == result[i])) { | |
cerr << (unsigned int)exp[i] << ", got instead " << (unsigned int)result[i] << endl; | |
passed = false; | |
break; | |
} | |
} | |
uninitialized_fill_n(result, result_length, 42); //fill with garbage, so that the old result is no longer there if memory is reused | |
result_dealloc(result); | |
return passed; | |
} | |
template<typename ArrayType, typename ResultType> | |
void run_test_on_func(const string name, function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, function<ResultType*(size_t)> alloc, function<void(ResultType*)> dealloc) { | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001111}, 1, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001011}, 1, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111111}, 1, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111011}, 1, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, 11, (const ResultType[]){0b11111111, 0b00000111}, 2, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,1}, 11, (const ResultType[]){0b11111011, 0b00000011}, 2, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111111, 0b11111111}, 2, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111011, 0b10111111}, 2, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111111, 0b11111111, 0b11}, 2, alloc, dealloc)); | |
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5,5,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111011, 0b10111111, 0b10}, 2, alloc, dealloc)); | |
cout << "ok: " << name << endl; | |
} | |
void touch(uint8_t* result, size_t length) { | |
ofstream null("/dev/null"); | |
null << accumulate(result, result+length, 0); | |
} | |
double benchmark(function<void(const aligned_uint32_t*, const aligned_uint32_t*, aligned_uint8_t*, size_t)> f, const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* buffer, size_t length) { | |
chrono::time_point<chrono::high_resolution_clock> start, end; | |
chrono::nanoseconds elapsed = chrono::nanoseconds(0); | |
for(int i = 0; i < RUNS; i++) { | |
start = std::chrono::high_resolution_clock::now(); | |
f(a, b, buffer, length); | |
end = std::chrono::high_resolution_clock::now(); | |
size_t result_length = padded_length(length, 8); | |
//touch(buffer, result_length); | |
elapsed += end-start; | |
} | |
return elapsed.count()/1.0e9; | |
} | |
void run_test() { | |
function<aligned_uint8_t*(size_t)> alloc_uint8_t = [](size_t size) -> aligned_uint8_t* { return new aligned_uint8_t[size]; }; | |
function<void(aligned_uint8_t*)> dealloc_uint8_t = [](aligned_uint8_t* r) -> void { delete [] r;}; | |
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_naive", eq_naive, alloc_uint8_t, dealloc_uint8_t); | |
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd", eq_hadd, alloc_uint8_t, dealloc_uint8_t); | |
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd_2", eq_hadd_2, alloc_uint8_t, dealloc_uint8_t); | |
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_movemask", eq_movemask, alloc_uint8_t, dealloc_uint8_t); | |
} | |
void run_benchmark() { | |
//const aligned_uint32_t* a = (const aligned_uint32_t[]){0,1,2,3,4,5,6,7,8,9,0}; | |
const size_t length = SIZE; | |
aligned_uint32_t* a = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16); | |
aligned_uint32_t* b = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16); | |
uint8_t* result = (uint8_t*)_mm_malloc(sizeof(uint8_t) * padded_length(length, 8), 16); | |
cout << "running naive..." << endl; | |
double naive_time = benchmark(eq_naive, a, b, result, length); | |
cout << "running hadd..." << endl; | |
double hadd_time = benchmark(eq_hadd, a, b, result, length); | |
cout << "running hadd_2..." << endl; | |
double hadd_2_time = benchmark(eq_hadd_2, a, b, result, length); | |
cout << "running movemask..." << endl; | |
double movemask_time = benchmark(eq_movemask, a, b, result, length); | |
cout << endl; | |
double base_time = naive_time; | |
cout << "naive: " << naive_time << " (" << naive_time/base_time << ")" << endl; | |
cout << "hadd: " << hadd_time << " (" << hadd_time/base_time << ")" << endl; | |
cout << "hadd_2: " << hadd_2_time << " (" << hadd_2_time/base_time << ")" << endl; | |
cout << "movemask: " << movemask_time << " (" << movemask_time/base_time << ")" << endl; | |
_mm_free(a); | |
_mm_free(b); | |
_mm_free(result); | |
} | |
int main(int argc, const char** argv) { | |
run_test(); | |
cout << "-------" << endl; | |
run_benchmark(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment