Last active
October 1, 2020 07:43
-
-
Save Mytherin/0c494ab3c3aee490e7d1be66a228b87c to your computer and use it in GitHub Desktop.
AVX512 Bitmask vs scalar selection vector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono> | |
#include <iostream> | |
#include <stdlib.h> | |
#include <bitset> | |
#include <vector> | |
// #include "x86/avx.h" | |
// #include "x86/avx512f.h" | |
// #include "x86/avx512h.h" | |
#include <immintrin.h> | |
/* | |
INT32 | |
clang++ -std=c++11 test.cpp -O3 -mavx512f && ./a.out | |
Bitmask (no SIMD) (Selectivity 100): 649ms (Result: 51201370174) | |
Bitmask (no SIMD) (Selectivity 99): 673ms (Result: 50694880171) | |
Bitmask (no SIMD) (Selectivity 95): 662ms (Result: 48667251986) | |
Bitmask (no SIMD) (Selectivity 75): 648ms (Result: 38528036114) | |
Bitmask (no SIMD) (Selectivity 50): 646ms (Result: 25854797349) | |
Bitmask (no SIMD) (Selectivity 25): 647ms (Result: 13180588168) | |
Bitmask (no SIMD) (Selectivity 5): 650ms (Result: 3041189445) | |
Bitmask (no SIMD) (Selectivity 1): 647ms (Result: 1013846053) | |
Bitmask (no SIMD) (Selectivity -1): 315ms (Result: 0) | |
Bitmask (SIMD) (Selectivity 100): 482ms (Result: 51201370174) | |
Bitmask (SIMD) (Selectivity 99): 482ms (Result: 50694880171) | |
Bitmask (SIMD) (Selectivity 95): 482ms (Result: 48667251986) | |
Bitmask (SIMD) (Selectivity 75): 482ms (Result: 38528036114) | |
Bitmask (SIMD) (Selectivity 50): 481ms (Result: 25854797349) | |
Bitmask (SIMD) (Selectivity 25): 482ms (Result: 13180588168) | |
Bitmask (SIMD) (Selectivity 5): 482ms (Result: 3041189445) | |
Bitmask (SIMD) (Selectivity 1): 482ms (Result: 1013846053) | |
Bitmask (SIMD) (Selectivity -1): 482ms (Result: 0) | |
SelVector (Selectivity 100): 625ms (Result: 51201370174) | |
SelVector (Selectivity 99): 632ms (Result: 50694880171) | |
SelVector (Selectivity 95): 631ms (Result: 48667251986) | |
SelVector (Selectivity 75): 623ms (Result: 38528036114) | |
SelVector (Selectivity 50): 613ms (Result: 25854797349) | |
SelVector (Selectivity 25): 601ms (Result: 13180588168) | |
SelVector (Selectivity 5): 549ms (Result: 3041189445) | |
SelVector (Selectivity 1): 466ms (Result: 1013846053) | |
SelVector (Selectivity -1): 283ms (Result: 0) | |
INT64 (See https://gist.github.com/Mytherin/b5c0dc2ca20ca6754255bb3abfbfedbc) | |
Bitmask (no SIMD) (Selectivity 100): 1088ms (Result: 51201370174) | |
Bitmask (no SIMD) (Selectivity 99): 1122ms (Result: 50694880171) | |
Bitmask (no SIMD) (Selectivity 95): 1158ms (Result: 48667251986) | |
Bitmask (no SIMD) (Selectivity 75): 1138ms (Result: 38528036114) | |
Bitmask (no SIMD) (Selectivity 50): 1134ms (Result: 25854797349) | |
Bitmask (no SIMD) (Selectivity 25): 1137ms (Result: 13180588168) | |
Bitmask (no SIMD) (Selectivity 5): 1125ms (Result: 3041189445) | |
Bitmask (no SIMD) (Selectivity 1): 1081ms (Result: 1013846053) | |
Bitmask (no SIMD) (Selectivity -1): 483ms (Result: 0) | |
Bitmask (SIMD) (Selectivity 100): 925ms (Result: 51201370174) | |
Bitmask (SIMD) (Selectivity 99): 926ms (Result: 50694880171) | |
Bitmask (SIMD) (Selectivity 95): 927ms (Result: 48667251986) | |
Bitmask (SIMD) (Selectivity 75): 928ms (Result: 38528036114) | |
Bitmask (SIMD) (Selectivity 50): 928ms (Result: 25854797349) | |
Bitmask (SIMD) (Selectivity 25): 928ms (Result: 13180588168) | |
Bitmask (SIMD) (Selectivity 5): 928ms (Result: 3041189445) | |
Bitmask (SIMD) (Selectivity 1): 928ms (Result: 1013846053) | |
Bitmask (SIMD) (Selectivity -1): 927ms (Result: 0) | |
SelVector (Selectivity 100): 925ms (Result: 51201370174) | |
SelVector (Selectivity 99): 925ms (Result: 50694880171) | |
SelVector (Selectivity 95): 928ms (Result: 48667251986) | |
SelVector (Selectivity 75): 922ms (Result: 38528036114) | |
SelVector (Selectivity 50): 914ms (Result: 25854797349) | |
SelVector (Selectivity 25): 895ms (Result: 13180588168) | |
SelVector (Selectivity 5): 733ms (Result: 3041189445) | |
SelVector (Selectivity 1): 545ms (Result: 1013846053) | |
SelVector (Selectivity -1): 327ms (Result: 0) | |
INT32 | |
clang++ -std=c++11 test.cpp -O3 -DDISABLE_AVX512 && ./a.out | |
Bitmask (no SIMD) (Selectivity 100): 631ms (Result: 51201370174) | |
Bitmask (no SIMD) (Selectivity 99): 650ms (Result: 50694880171) | |
Bitmask (no SIMD) (Selectivity 95): 636ms (Result: 48667251986) | |
Bitmask (no SIMD) (Selectivity 75): 603ms (Result: 38528036114) | |
Bitmask (no SIMD) (Selectivity 50): 579ms (Result: 25854797349) | |
Bitmask (no SIMD) (Selectivity 25): 558ms (Result: 13180588168) | |
Bitmask (no SIMD) (Selectivity 5): 547ms (Result: 3041189445) | |
Bitmask (no SIMD) (Selectivity 1): 542ms (Result: 1013846053) | |
Bitmask (no SIMD) (Selectivity -1): 243ms (Result: 0) | |
SelVector (Selectivity 100): 678ms (Result: 51201370174) | |
SelVector (Selectivity 99): 679ms (Result: 50694880171) | |
SelVector (Selectivity 95): 672ms (Result: 48667251986) | |
SelVector (Selectivity 75): 642ms (Result: 38528036114) | |
SelVector (Selectivity 50): 609ms (Result: 25854797349) | |
SelVector (Selectivity 25): 580ms (Result: 13180588168) | |
SelVector (Selectivity 5): 516ms (Result: 3041189445) | |
SelVector (Selectivity 1): 414ms (Result: 1013846053) | |
SelVector (Selectivity -1): 258ms (Result: 0) | |
INT64 | |
clang++ -std=c++11 test-64.cpp -O3 -DDISABLE_AVX512 && ./a.out | |
Bitmask (no SIMD) (Selectivity 100): 935ms (Result: 51201370174) | |
Bitmask (no SIMD) (Selectivity 99): 977ms (Result: 50694880171) | |
Bitmask (no SIMD) (Selectivity 95): 999ms (Result: 48667251986) | |
Bitmask (no SIMD) (Selectivity 75): 989ms (Result: 38528036114) | |
Bitmask (no SIMD) (Selectivity 50): 979ms (Result: 25854797349) | |
Bitmask (no SIMD) (Selectivity 25): 974ms (Result: 13180588168) | |
Bitmask (no SIMD) (Selectivity 5): 964ms (Result: 3041189445) | |
Bitmask (no SIMD) (Selectivity 1): 923ms (Result: 1013846053) | |
Bitmask (no SIMD) (Selectivity -1): 359ms (Result: 0) | |
SelVector (Selectivity 100): 968ms (Result: 51201370174) | |
SelVector (Selectivity 99): 974ms (Result: 50694880171) | |
SelVector (Selectivity 95): 973ms (Result: 48667251986) | |
SelVector (Selectivity 75): 960ms (Result: 38528036114) | |
SelVector (Selectivity 50): 940ms (Result: 25854797349) | |
SelVector (Selectivity 25): 912ms (Result: 13180588168) | |
SelVector (Selectivity 5): 747ms (Result: 3041189445) | |
SelVector (Selectivity 1): 603ms (Result: 1013846053) | |
SelVector (Selectivity -1): 340ms (Result: 0) | |
*/ | |
// https://github.com/simd-everywhere/simde/releases/tag/v0.6.0 | |
// clang++ -std=c++11 test.cpp -O3 -mavx512f && ./a.out | |
// clang++ -std=c++11 test.cpp -O3 -DDISABLE_AVX512 && ./a.out | |
using namespace std; | |
#define VECTOR_SIZE 1024 | |
// 1024 / 64 | |
#define MASK_COUNT 16 | |
#define DATA_SIZE 512000000 | |
struct nullmask_t { | |
union { | |
uint64_t mask_u64[VECTOR_SIZE / 64]; | |
uint32_t mask_u32[VECTOR_SIZE / 32]; | |
uint16_t mask_u16[VECTOR_SIZE / 16]; | |
uint8_t mask_u8[VECTOR_SIZE / 8]; | |
} mask_union; | |
}; | |
size_t sel_vector_lt(int32_t * a, int32_t constant, size_t count, uint16_t *result_sel) { | |
size_t result_count = 0; | |
for(size_t i = 0; i < count; i++) { | |
result_sel[result_count] = i; | |
result_count += a[i] <= constant; | |
} | |
return result_count; | |
} | |
void sel_vector_addition(const int32_t * a , const int32_t * b , size_t count, const uint16_t *sel, int32_t * result ) { | |
for(size_t i = 0; i < count; i++) { | |
result[i] = a[sel[i]] + b[sel[i]]; | |
} | |
} | |
void sel_vector_summation(const int32_t * a , size_t count, int64_t &sum) { | |
for(size_t i = 0; i < count; i++) { | |
sum += a[i]; | |
} | |
} | |
void selection_vector_benchmark(int32_t *a, int32_t *b, int32_t *c, int32_t constant) { | |
uint16_t sel_vector[VECTOR_SIZE]; | |
int32_t intermediate[VECTOR_SIZE]; | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
size_t result_count = sel_vector_lt(c + i, constant, VECTOR_SIZE, sel_vector); | |
sel_vector_addition(a + i, b + i, result_count, sel_vector, intermediate); | |
sel_vector_summation(intermediate, result_count, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "SelVector (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
int32_t *aligned_alloc(int32_t count) { | |
auto ptr = new uint8_t[count * sizeof(int32_t) + 63]; | |
return (int32_t*)(((uintptr_t(ptr) + 63) / 64) * 64); | |
} | |
#ifndef DISABLE_AVX512 | |
void bitmask_lt(const int32_t *a , const __m512i constant , size_t count, nullmask_t &sel) { | |
for(size_t i = 0, target = count / 16; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi32(a + i * 16); | |
sel.mask_union.mask_u16[i] = _mm512_cmple_epi32_mask(a_mm512, constant); | |
} | |
} | |
void bitmask_addition(const int32_t *a , const int32_t *b , size_t count, const nullmask_t &sel, int32_t *result ) { | |
for(size_t i = 0, target = count / 16; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi32(a + i * 16); | |
__m512i b_mm512 = _mm512_load_epi32(b + i * 16); | |
auto mask = sel.mask_union.mask_u16[i]; | |
auto result_mm512 = _mm512_maskz_add_epi32(mask, a_mm512, b_mm512); | |
_mm512_store_epi32(result + i * 16, result_mm512); | |
} | |
} | |
void bitmask_summation(const int32_t *a , const nullmask_t &sel, size_t count, int64_t &sum) { | |
for(size_t i = 0, target = count / 16; i < target; i++) { | |
__m512i a_mm512 = _mm512_load_epi32(a + i * 16); | |
sum += _mm512_mask_reduce_add_epi32(sel.mask_union.mask_u16[i], a_mm512); | |
} | |
} | |
void bitmask_benchmark(int32_t *a, int32_t *b, int32_t *c, int32_t constant) { | |
nullmask_t selection_mask; | |
int32_t *constant_i32 = aligned_alloc(VECTOR_SIZE); | |
for(int i = 0; i < VECTOR_SIZE; i++) { | |
constant_i32[i] = constant; | |
} | |
__m512i constant_mm512 = _mm512_load_epi32(constant_i32); | |
int32_t *intermediate = aligned_alloc(VECTOR_SIZE); | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
bitmask_lt((c + i), constant_mm512, VECTOR_SIZE, selection_mask); | |
bitmask_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate); | |
bitmask_summation(intermediate, selection_mask, VECTOR_SIZE, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "Bitmask (SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
#endif | |
void bitmask_nosimd_lt(const int32_t *a , const int32_t constant , size_t count, nullmask_t &sel) { | |
for(size_t i = 0, target = count / 32; i < target; i++) { | |
sel.mask_union.mask_u32[i] = 0; | |
for(size_t j = 0; j < 32; j++) { | |
sel.mask_union.mask_u32[i] |= (a[i * 32 + j] <= constant) << j; | |
} | |
} | |
} | |
size_t bitmask_nosimd_addition(const int32_t *a , const int32_t *b , size_t count, const nullmask_t &sel, int32_t *result) { | |
size_t result_count = 0; | |
for(size_t i = 0, target = count / 32; i < target; i++) { | |
auto mask = sel.mask_union.mask_u32[i]; | |
if (mask == 0) { | |
// no matches | |
continue; | |
} else if (mask == UINT32_C(0xFFFFFFFF)) { | |
// everything matches | |
for(size_t j = 0; j < 32; j++) { | |
auto index = i * 32 + j; | |
result[result_count++] = a[index] + b[index]; | |
} | |
} else { | |
// partial matches | |
for(size_t j = 0; j < 32; j++) { | |
auto index = i * 32 + j; | |
result[result_count] = a[index] + b[index]; | |
result_count += bool(mask & (1 << j)); | |
} | |
} | |
} | |
return result_count; | |
} | |
void bitmask_benchmark_nosimd(int32_t *a, int32_t *b, int32_t *c, int32_t constant) { | |
nullmask_t selection_mask; | |
int32_t *intermediate = aligned_alloc(VECTOR_SIZE); | |
int64_t sum = 0; | |
auto start = chrono::system_clock::now(); | |
for(int i = 0; i < DATA_SIZE; i += VECTOR_SIZE) { | |
bitmask_nosimd_lt((c + i), constant, VECTOR_SIZE, selection_mask); | |
size_t result_count = bitmask_nosimd_addition(a + i, b + i, VECTOR_SIZE, selection_mask, intermediate); | |
sel_vector_summation(intermediate, result_count, sum); | |
} | |
auto end = chrono::system_clock::now(); | |
auto elapsed = | |
std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "Bitmask (no SIMD) (Selectivity " << constant << "): " << elapsed.count() << "ms (Result: " << sum << ")\n"; | |
} | |
int main() { | |
int32_t *a = aligned_alloc(DATA_SIZE); | |
int32_t *b = aligned_alloc(DATA_SIZE); | |
int32_t *c = aligned_alloc(DATA_SIZE); | |
for(size_t i = 0; i < DATA_SIZE; i++) { | |
a[i] = rand() % 101; | |
b[i] = rand() % 101; | |
c[i] = rand() % 101; | |
} | |
// SELECT SUM(a+b) FROM tbl WHERE c <= CONSTANT | |
// CONSTANT determines selectivity (0-100%) | |
vector<int> selectivities { 100, 99, 95, 75, 50, 25, 5, 1, -1}; | |
for(auto selectivity : selectivities) { | |
bitmask_benchmark_nosimd(a, b, c, selectivity); | |
} | |
#ifndef DISABLE_AVX512 | |
for(auto selectivity : selectivities) { | |
bitmask_benchmark(a, b, c, selectivity); | |
} | |
#endif | |
for(auto selectivity : selectivities) { | |
selection_vector_benchmark(a, b, c, selectivity); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment