Skip to content

Instantly share code, notes, and snippets.

@simonwagner
Last active August 29, 2015 14:06
Show Gist options
  • Save simonwagner/a6b4a8f9d2eadc4b89df to your computer and use it in GitHub Desktop.
Save simonwagner/a6b4a8f9d2eadc4b89df to your computer and use it in GitHub Desktop.
UTF32 compare benchmark
/*
Compile with gcc, because clang does not support alignment of stack values (and does not even warn about that).
g++-mp-4.8 -march=native -mtune=native -std=c++11 benchmark.cpp -O3 -o benchmark
Results on my computer:
naive: 22.6027 (1)
hadd: 10.0168 (0.443169)
hadd_2: 9.88255 (0.437228)
movemask: 9.85698 (0.436097)
*/
#include <iostream>
#include <algorithm>
#include <memory>
#include <functional>
#include <cassert>
#include <string>
#include <chrono>
#include <fstream>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <tmmintrin.h>
using namespace std;
#define assert_aligned(x) assert((uintptr_t)(x) % 16 == 0)
#define RUNS (1000)
#define SIZE (1024*1024*8)
typedef uint32_t aligned_uint32_t __attribute__ ((aligned (16)));
typedef uint8_t aligned_uint8_t __attribute__ ((aligned (16)));
size_t inline constexpr aligned_length(size_t length, int alignement) {
return length - (length % alignement);
}
size_t inline constexpr padded_length(size_t length, int multiple) {
return length/multiple + (length % multiple != 0 ? 1 : 0);
}
void eq_naive(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
size_t i;
const size_t result_length = padded_length(length, 8);
for(i = 0; i < length; ++i) {
uint8_t cmp = (a[i] == b[i] ? 1 : 0);
result[i/8] |= cmp << (i % 8);
}
if(length % 8 > 0) {
result[length - 1] >>= 8 - (length % 8);
}
}
__m128i inline vector_cmp(const aligned_uint32_t* a, const aligned_uint32_t* b) {
__m128i mmx_mask = _mm_set_epi32(8,4,2,1);
__m128i mmx_a = _mm_load_si128((__m128i*)a);
__m128i mmx_b = _mm_load_si128((__m128i*)b);
__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
__m128i mmx_and = _mm_and_si128(mmx_cmp, mmx_mask);
return mmx_and;
}
void eq_hadd(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
assert_aligned(a);
assert_aligned(b);
assert_aligned(result);
size_t i;
const size_t result_length = padded_length(length, 8);
__m128i mmx_zero = _mm_setzero_si128();
for(i = 0; i < aligned_length(length, 8); i+=8) {
__m128i mmx_temp_result[2];
for(int mi = 0; mi < 2; ++mi) {
__m128i mmx_and = vector_cmp(&a[i+4*mi], &b[i+4*mi]);
__m128i mmx_hadd1 = _mm_hadd_epi32(mmx_and, mmx_zero);
mmx_temp_result[mi] = _mm_hadd_epi32(mmx_hadd1, mmx_zero);
}
__m128i shifted_mmx_temp_result = _mm_slli_epi64(mmx_temp_result[1], 4);
__m128i mmx_result = _mm_or_si128(mmx_temp_result[0], shifted_mmx_temp_result);
result[i/8] = _mm_extract_epi8(mmx_result, 0);
}
for(; i < length; ++i) {
uint8_t cmp = (a[i] == b[i] ? 1 : 0);
result[i/8] |= cmp << (i % 8);
}
if(length % 8 > 0) {
result[length - 1] >>= 8 - (length % 8);
}
}
void eq_hadd_2(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
assert_aligned(a);
assert_aligned(b);
assert_aligned(result);
size_t i;
const size_t result_length = padded_length(length, 8);
__m128i mmx_mult_shift = _mm_set_epi32(16,1,16,1);
uint16_t* result_16bit_block = (uint16_t*)result; //we will write to the result in 16bit blocks
for(i = 0; i < aligned_length(length, 16); i+=16) {
__m128i mmx_a = vector_cmp(&a[i + 4*0], &b[i + 4*0]);
__m128i mmx_b = vector_cmp(&a[i + 4*1], &b[i + 4*1]);
__m128i mmx_c = vector_cmp(&a[i + 4*2], &b[i + 4*2]);
__m128i mmx_d = vector_cmp(&a[i + 4*3], &b[i + 4*3]);
__m128i mmx_x = _mm_hadd_epi32(mmx_a,mmx_b);
__m128i mmx_y = _mm_hadd_epi32(mmx_c,mmx_d);
__m128i mmx_z = _mm_hadd_epi32(mmx_x,mmx_y);
__m128i mmx_z_shifted = _mm_mullo_epi32(mmx_z, mmx_mult_shift);
__m128i mmx_result = _mm_hadd_epi32(mmx_z_shifted, mmx_z_shifted /*this argument does not matter*/);
/*
result[i/16 + 0] = _mm_extract_epi8(mmx_result, 0);
result[i/16 + 1] = _mm_extract_epi8(mmx_result, 4);
*/
//instead of writing the final values twice to memory, shuffle the result
//and write it into memory at once
__m128i mmx_shuffle_mask = _mm_set_epi32(0x80808080,0x80808080,0x80808080,0x80800400); /*4->1 and 0->0 */
__m128i mmx_shuffled_result = _mm_shuffle_epi8(mmx_result, mmx_shuffle_mask);
result_16bit_block[i/16] = _mm_extract_epi16(mmx_shuffled_result, 0);
}
for(; i < length; ++i) {
uint8_t cmp = (a[i] == b[i] ? 1 : 0);
result[i/8] |= cmp << (i % 8);
}
if(length % 8 > 0) {
result[length - 1] >>= 8 - (length % 8);
}
}
void eq_movemask(const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* result, size_t length) {
assert_aligned(a);
assert_aligned(b);
assert_aligned(result);
size_t i;
const size_t result_length = padded_length(length, 8);
for(i = 0; i < aligned_length(length, 8); i+=8) {
uint16_t tmp_result[2];
for(int mi = 0; mi < 2; ++mi) {
__m128i mmx_a = _mm_load_si128((__m128i*)&a[i+4*mi]);
__m128i mmx_b = _mm_load_si128((__m128i*)&b[i+4*mi]);
__m128i mmx_cmp = _mm_cmpeq_epi32(mmx_a, mmx_b);
auto tmp1 = _mm_packs_epi32(mmx_cmp, mmx_cmp);
auto tmp2 = _mm_packs_epi32(tmp1, tmp1);
tmp_result[mi] = _mm_movemask_epi8(tmp2) & 0x0F;
}
result[i/8] = (tmp_result[1] << 4) | (tmp_result[0]);
}
for(; i < length; ++i) {
uint8_t cmp = (a[i] == b[i] ? 1 : 0);
result[i/8] |= cmp << (i % 8);
}
if(length % 8 > 0) {
result[length - 1] >>= 8 - (length % 8);
}
}
template<typename ArrayType, typename ResultType>
bool exec_test(function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, const ArrayType* a, const ArrayType* b, size_t input_length, const ResultType* exp, size_t result_length, function<ResultType*(size_t)> result_alloc, function<void(ResultType*)> result_dealloc) {
ResultType* result = result_alloc(result_length);
tested_func(a, b, result, input_length);
bool passed = true;
for(int i = 0; i < result_length; ++i) {
if(!(exp[i] == result[i])) {
cerr << (unsigned int)exp[i] << ", got instead " << (unsigned int)result[i] << endl;
passed = false;
break;
}
}
uninitialized_fill_n(result, result_length, 42); //fill with garbage, so that the old result is no longer there if memory is reused
result_dealloc(result);
return passed;
}
template<typename ArrayType, typename ResultType>
void run_test_on_func(const string name, function<void(const ArrayType*, const ArrayType*, ResultType*, size_t)> tested_func, function<ResultType*(size_t)> alloc, function<void(ResultType*)> dealloc) {
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001111}, 1, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3}, (const ArrayType[]){0,1,2,3}, 4, (const ResultType[]){0b00001011}, 1, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111111}, 1, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7}, 8, (const ResultType[]){0b11111011}, 1, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0}, 11, (const ResultType[]){0b11111111, 0b00000111}, 2, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,1}, 11, (const ResultType[]){0b11111011, 0b00000011}, 2, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111111, 0b11111111}, 2, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}, 16, (const ResultType[]){0b11111011, 0b10111111}, 2, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111111, 0b11111111, 0b11}, 2, alloc, dealloc));
assert(exec_test(tested_func, (const ArrayType[]){0,1,5,3,4,5,6,7,8,9,0,1,2,3,5,5,5,7}, (const ArrayType[]){0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7}, 18, (const ResultType[]){0b11111011, 0b10111111, 0b10}, 2, alloc, dealloc));
cout << "ok: " << name << endl;
}
void touch(uint8_t* result, size_t length) {
ofstream null("/dev/null");
null << accumulate(result, result+length, 0);
}
double benchmark(function<void(const aligned_uint32_t*, const aligned_uint32_t*, aligned_uint8_t*, size_t)> f, const aligned_uint32_t* a, const aligned_uint32_t* b, aligned_uint8_t* buffer, size_t length) {
chrono::time_point<chrono::high_resolution_clock> start, end;
chrono::nanoseconds elapsed = chrono::nanoseconds(0);
for(int i = 0; i < RUNS; i++) {
start = std::chrono::high_resolution_clock::now();
f(a, b, buffer, length);
end = std::chrono::high_resolution_clock::now();
size_t result_length = padded_length(length, 8);
//touch(buffer, result_length);
elapsed += end-start;
}
return elapsed.count()/1.0e9;
}
void run_test() {
function<aligned_uint8_t*(size_t)> alloc_uint8_t = [](size_t size) -> aligned_uint8_t* { return new aligned_uint8_t[size]; };
function<void(aligned_uint8_t*)> dealloc_uint8_t = [](aligned_uint8_t* r) -> void { delete [] r;};
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_naive", eq_naive, alloc_uint8_t, dealloc_uint8_t);
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd", eq_hadd, alloc_uint8_t, dealloc_uint8_t);
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_hadd_2", eq_hadd_2, alloc_uint8_t, dealloc_uint8_t);
run_test_on_func<aligned_uint32_t, aligned_uint8_t>("eq_movemask", eq_movemask, alloc_uint8_t, dealloc_uint8_t);
}
void run_benchmark() {
//const aligned_uint32_t* a = (const aligned_uint32_t[]){0,1,2,3,4,5,6,7,8,9,0};
const size_t length = SIZE;
aligned_uint32_t* a = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16);
aligned_uint32_t* b = (aligned_uint32_t*)_mm_malloc(sizeof(uint32_t) * length, 16);
uint8_t* result = (uint8_t*)_mm_malloc(sizeof(uint8_t) * padded_length(length, 8), 16);
cout << "running naive..." << endl;
double naive_time = benchmark(eq_naive, a, b, result, length);
cout << "running hadd..." << endl;
double hadd_time = benchmark(eq_hadd, a, b, result, length);
cout << "running hadd_2..." << endl;
double hadd_2_time = benchmark(eq_hadd_2, a, b, result, length);
cout << "running movemask..." << endl;
double movemask_time = benchmark(eq_movemask, a, b, result, length);
cout << endl;
double base_time = naive_time;
cout << "naive: " << naive_time << " (" << naive_time/base_time << ")" << endl;
cout << "hadd: " << hadd_time << " (" << hadd_time/base_time << ")" << endl;
cout << "hadd_2: " << hadd_2_time << " (" << hadd_2_time/base_time << ")" << endl;
cout << "movemask: " << movemask_time << " (" << movemask_time/base_time << ")" << endl;
_mm_free(a);
_mm_free(b);
_mm_free(result);
}
int main(int argc, const char** argv) {
run_test();
cout << "-------" << endl;
run_benchmark();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment