Last active
December 2, 2022 08:15
-
-
Save nixiz/c3bb8f6029b64f9281ac44cbc119209a to your computer and use it in GitHub Desktop.
cache friendly programming best practices with google/benchmark results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <benchmark/benchmark.h> | |
#include <iostream> | |
#include <random> | |
#include <algorithm> | |
#include <vector> | |
#include <chrono> | |
#include <iterator> | |
#include <execution> | |
static void count_if_random(benchmark::State& state) | |
{ | |
std::vector<int> v(65536); | |
std::generate(std::begin(v), std::end(v), [] | |
{ | |
return (rand() % 2) ? 1 : -1; | |
}); | |
for (auto _ : state) { | |
auto count = std::count_if( | |
std::execution::seq, | |
std::begin(v), std::end(v), | |
[](auto x) | |
{ | |
return x > 0; | |
}); | |
benchmark::DoNotOptimize(count); | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(count_if_random)->Unit(benchmark::kMicrosecond); | |
static void count_if_sequenced(benchmark::State& state) | |
{ | |
std::vector<int> v(65536); | |
std::generate(std::begin(v), std::end(v), [n = 0]() mutable { | |
return (++n % 2) ? 1 : -1; | |
}); | |
for (auto _ : state) { | |
auto count = std::count_if( | |
std::execution::seq, | |
std::begin(v), std::end(v), | |
[](auto x) | |
{ | |
return x > 0; | |
}); | |
benchmark::DoNotOptimize(count); | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(count_if_sequenced)->Unit(benchmark::kMicrosecond); | |
static void count_if_sorted(benchmark::State& state) | |
{ | |
std::vector<int> v(65536); | |
std::generate(std::begin(v), std::end(v), | |
[] { | |
return (rand() % 2) ? 1 : -1; | |
}); | |
std::sort(v.begin(), v.end()); | |
for (auto _ : state) { | |
auto count = std::count_if( | |
std::execution::seq, | |
std::begin(v), std::end(v), | |
[](auto x) | |
{ | |
return x > 0; | |
}); | |
benchmark::DoNotOptimize(count); | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(count_if_sorted)->Unit(benchmark::kMicrosecond); | |
namespace branch_prediction | |
{ | |
struct price | |
{ | |
virtual ~price() {} | |
virtual float getPrice() const noexcept { return 1.0f; } | |
}; | |
struct cheap : public price | |
{ | |
float getPrice() const noexcept override { return 2.0f; } | |
}; | |
struct expensive : public price | |
{ | |
float getPrice() const noexcept override { return 3.14159f; } | |
}; | |
} // namespace branch_prediction | |
static void virtual_call_sequenced(benchmark::State& state) | |
{ | |
using namespace branch_prediction; | |
std::vector<price*> pricelist; | |
std::fill_n(std::back_inserter(pricelist), 10000, new price); | |
std::fill_n(std::back_inserter(pricelist), 10000, new cheap); | |
std::fill_n(std::back_inserter(pricelist), 10000, new expensive); | |
for (auto _ : state) { | |
float sum = 0; | |
for (auto* p : pricelist) { | |
benchmark::DoNotOptimize(sum += p->getPrice()); | |
} | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(virtual_call_sequenced)->Unit(benchmark::kMicrosecond); | |
static void virtual_call_shuffled(benchmark::State& state) | |
{ | |
using namespace branch_prediction; | |
std::vector<price*> pricelist; | |
std::random_device rd; | |
std::mt19937 g{ rd() }; | |
std::fill_n(std::back_inserter(pricelist), 10000, new price); | |
std::fill_n(std::back_inserter(pricelist), 10000, new cheap); | |
std::fill_n(std::back_inserter(pricelist), 10000, new expensive); | |
std::shuffle(pricelist.begin(), pricelist.end(), g); | |
for (auto _ : state) { | |
float sum = 0; | |
for (auto* p : pricelist) { | |
benchmark::DoNotOptimize(sum += p->getPrice()); | |
} | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(virtual_call_shuffled)->Unit(benchmark::kMicrosecond); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <limits> | |
#include <benchmark/benchmark.h> | |
#include <type_traits> | |
#include <chrono> | |
#include <random> | |
#include <map> | |
#include <string> | |
#include <sstream> | |
#include <iostream> | |
#include <iomanip> | |
#include <execution> | |
namespace cpu_cache_misses { | |
enum class traversal_order { | |
row_major, | |
column_major | |
}; | |
template <unsigned int size> | |
struct random_map | |
{ | |
unsigned int map[size] = { 0 }; | |
random_map() { | |
srand(static_cast<unsigned int>(time(NULL))); | |
std::generate(std::execution::par, std::begin(map), std::end(map), [] { return std::rand() % size; }); | |
} | |
unsigned int operator()() const { | |
return map[++indx % size]; | |
} | |
private: | |
mutable unsigned long long indx = 0; | |
}; | |
const static random_map<8 << 10> random_numbers; | |
template <class T> | |
class matrix { | |
public: | |
using type_t = typename std::decay<T>::type; | |
using msize_t = unsigned long long; | |
explicit matrix(msize_t matrix_size) | |
{ | |
reset(matrix_size); | |
} | |
~matrix() | |
{ | |
if (data_ != nullptr) release(); | |
} | |
void reset(msize_t matrix_size) | |
{ | |
if (data_ != nullptr) release(); | |
matrix_dim = static_cast<msize_t>(sqrt(matrix_size)); | |
const auto dim_size = matrix_dim % 2 == 0 ? matrix_dim : matrix_dim + 1; | |
row_size = dim_size; | |
column_size = dim_size; | |
// check if matrix is equal sized | |
assert(row_size == column_size); | |
data_ = new type_t * [row_size]; | |
for (msize_t i = 0; i < row_size; i++) | |
{ | |
data_[i] = new type_t[column_size]; | |
// set initial values | |
for (msize_t j = 0; j < column_size; j++) | |
{ | |
// set random initial data | |
data_[i][j] = static_cast<type_t>(random_numbers()); | |
} | |
} | |
} | |
msize_t size() const { | |
return matrix_dim * matrix_dim; | |
} | |
msize_t rows() const { | |
return row_size; | |
} | |
msize_t columns() const { | |
return column_size; | |
} | |
type_t* operator[](msize_t index) { | |
return data_[index]; | |
} | |
type_t* operator[](msize_t index) const { | |
return data_[index]; | |
} | |
protected: | |
void release() { | |
// delete allocated source | |
for (size_t i = 0; i < row_size; i++) | |
{ | |
delete[] data_[i]; | |
} | |
delete[] data_; | |
data_ = nullptr; | |
} | |
private: | |
type_t** data_ = nullptr; | |
msize_t row_size{ 0 }; | |
msize_t column_size{ 0 }; | |
msize_t matrix_dim{ 0 }; | |
}; | |
} // namespace cpu_cache_misses | |
using namespace cpu_cache_misses; | |
static void sum_row_major(benchmark::State& state) | |
{ | |
using type = matrix<uint8_t>::msize_t; | |
const type matrix_size = (1024 * 1024) * state.range(0); | |
static matrix<uint8_t> mat(matrix_size); | |
if (matrix_size != mat.size()) { | |
mat.reset(matrix_size); | |
} | |
for (auto _ : state) { | |
type sum = 0; | |
for (type r = 0; r < mat.rows(); ++r) { | |
for (type c = 0; c < mat.columns(); ++c) { | |
benchmark::DoNotOptimize(sum += mat[r][c]); | |
} | |
} | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(sum_row_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond); | |
BENCHMARK(sum_row_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond); | |
using namespace cpu_cache_misses; | |
static void sum_column_major(benchmark::State& state) | |
{ | |
using type = matrix<uint8_t>::msize_t; | |
const type matrix_size = (1024 * 1024) * state.range(0); | |
static matrix<uint8_t> mat(matrix_size); | |
if (matrix_size != mat.size()) { | |
mat.reset(matrix_size); | |
} | |
for (auto _ : state) { | |
type sum = 0; | |
for (type c = 0; c < mat.columns(); ++c) { | |
for (type r = 0; r < mat.rows(); ++r) { | |
benchmark::DoNotOptimize(sum += mat[r][c]); | |
} | |
} | |
benchmark::ClobberMemory(); | |
} | |
} | |
BENCHMARK(sum_column_major)->DenseRange(1, 128, 2)->Unit(benchmark::kMillisecond); | |
BENCHMARK(sum_column_major)->DenseRange(128, 1024, 128)->Unit(benchmark::kMillisecond); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <benchmark/benchmark.h> | |
#include <atomic> | |
#include <thread> | |
#include <iostream> | |
#include <random> | |
#include <algorithm> | |
#include <vector> | |
#include <chrono> | |
template <typename input_t> | |
void work(input_t& a) | |
{ | |
for (int i = 0; i < 10000000; ++i) | |
a++; | |
} | |
void work_with_sentinel(std::atomic<int>& a) | |
{ | |
thread_local unsigned int sentinel = 0; | |
for (int i = 0; i < 10000000; ++i) | |
++sentinel; | |
a += sentinel; | |
} | |
namespace sharing_single_atomic | |
{ | |
inline int test_1() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work(a); }); | |
t1.join(); | |
return a; | |
} | |
inline int test_2() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(a); }); | |
t1.join(); t2.join(); | |
return a; | |
} | |
inline int test_3() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(a); }); | |
std::thread t3([&] { work(a); }); | |
t1.join(); t2.join(); t3.join(); | |
return a; | |
} | |
inline int test_4() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(a); }); | |
std::thread t3([&] { work(a); }); | |
std::thread t4([&] { work(a); }); | |
t1.join(); t2.join(); t3.join(); t4.join(); | |
return a; | |
} | |
inline int test_1_sentinel() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work_with_sentinel(a); }); | |
t1.join(); | |
return a; | |
} | |
inline int test_2_sentinel() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work_with_sentinel(a); }); | |
std::thread t2([&] { work_with_sentinel(a); }); | |
t1.join(); t2.join(); | |
return a; | |
} | |
inline int test_3_sentinel() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work_with_sentinel(a); }); | |
std::thread t2([&] { work_with_sentinel(a); }); | |
std::thread t3([&] { work_with_sentinel(a); }); | |
t1.join(); t2.join(); t3.join(); | |
return a; | |
} | |
inline int test_4_sentinel() | |
{ | |
std::atomic<int> a; a = 0; | |
std::thread t1([&] { work_with_sentinel(a); }); | |
std::thread t2([&] { work_with_sentinel(a); }); | |
std::thread t3([&] { work_with_sentinel(a); }); | |
std::thread t4([&] { work_with_sentinel(a); }); | |
t1.join(); t2.join(); t3.join(); t4.join(); | |
return a; | |
} | |
} | |
static void test_sharing_single_atomic(benchmark::State& state, int test_num) | |
{ | |
using namespace sharing_single_atomic; | |
int (*fun)() = nullptr; | |
switch (test_num) | |
{ | |
case 1: | |
fun = test_1; | |
break; | |
case 2: | |
fun = test_2; | |
break; | |
case 3: | |
fun = test_3; | |
break; | |
case 4: | |
fun = test_4; | |
break; | |
case 5: | |
fun = test_1_sentinel; | |
break; | |
case 6: | |
fun = test_2_sentinel; | |
break; | |
case 7: | |
fun = test_3_sentinel; | |
break; | |
case 8: | |
fun = test_4_sentinel; | |
break; | |
default: | |
{ | |
std::cerr << "undefined test number given!" << std::endl; | |
std::terminate(); | |
} | |
break; | |
} | |
for (auto _ : state) { | |
benchmark::DoNotOptimize(fun()); | |
} | |
} | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, one_threaded_with_sentinels, 5)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, two_threaded_with_sentinels, 6)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, three_threaded_with_sentinels, 7)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_single_atomic, four_threaded_with_sentinels, 8)->UseRealTime()->Unit(benchmark::kMillisecond); | |
namespace sharing_atomics_in_one_chache_line | |
{ | |
inline int test_2() | |
{ | |
std::atomic<int> a; a = 0; | |
std::atomic<int> b; b = 0; | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(b); }); | |
t1.join(); t2.join(); | |
return a + b; | |
} | |
inline int test_3() | |
{ | |
std::atomic<int> a; a = 0; | |
std::atomic<int> b; b = 0; | |
std::atomic<int> c; c = 0; | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(b); }); | |
std::thread t3([&] { work(c); }); | |
t1.join(); t2.join(); t3.join(); | |
return a + b + c; | |
} | |
inline int test_4() | |
{ | |
std::atomic<int> a; a = 0; // &a: 0x...b2f7c0 | |
std::atomic<int> b; b = 0; // &b: 0x...b2f7c4 | |
std::atomic<int> c; c = 0; // &c: 0x...b2f7c8 | |
std::atomic<int> d; d = 0; // &d: 0x...b2f7cc | |
std::thread t1([&] { work(a); }); | |
std::thread t2([&] { work(b); }); | |
std::thread t3([&] { work(c); }); | |
std::thread t4([&] { work(d); }); | |
t1.join(); t2.join(); t3.join(); t4.join(); | |
return a + b + c + d; | |
} | |
} | |
static void test_sharing_atomics_in_one_chache_line(benchmark::State& state, int test_num) | |
{ | |
using namespace sharing_atomics_in_one_chache_line; | |
int (*fun)() = nullptr; | |
switch (test_num) | |
{ | |
case 2: | |
fun = test_2; | |
break; | |
case 3: | |
fun = test_3; | |
break; | |
case 4: | |
fun = test_4; | |
break; | |
default: | |
{ | |
std::cerr << "undefined test number given!" << std::endl; | |
std::terminate(); | |
} | |
break; | |
} | |
for (auto _ : state) { | |
benchmark::DoNotOptimize(fun()); | |
} | |
} | |
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_chache_line, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_chache_line, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_sharing_atomics_in_one_chache_line, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond); | |
namespace false_sharing_resolved | |
{ | |
// aligned type with the same size of cache line | |
struct alignas(64) aligned_type | |
{ | |
std::atomic<int> val; | |
}; | |
static_assert(sizeof(aligned_type) == 64, "structure alignment must be same as cache line"); | |
inline int test_1() | |
{ | |
aligned_type a; a.val = 0; // &a: 0x...4ff240 | |
std::thread t1([&a] { work(a.val); }); | |
t1.join(); | |
return a.val; | |
} | |
inline int test_2() | |
{ | |
aligned_type a; a.val = 0; // &a: 0x...4ff240 | |
aligned_type b; b.val = 0; // &b: 0x...4ff280 | |
std::thread t1([&a] { work(a.val); }); | |
std::thread t2([&b] { work(b.val); }); | |
t1.join(); t2.join(); | |
return a.val + b.val; | |
} | |
inline int test_3() | |
{ | |
aligned_type a; a.val = 0; // &a: 0x...4ff240 | |
aligned_type b; b.val = 0; // &b: 0x...4ff280 | |
aligned_type c; c.val = 0; // &c: 0x...4ff2c0 | |
std::thread t1([&a] { work(a.val); }); | |
std::thread t2([&b] { work(b.val); }); | |
std::thread t3([&c] { work(c.val); }); | |
t1.join(); t2.join(); t3.join(); | |
return a.val + b.val + c.val; | |
} | |
inline int test_4() | |
{ | |
aligned_type a; a.val = 0; // &a: 0x...4ff240 | |
aligned_type b; b.val = 0; // &b: 0x...4ff280 | |
aligned_type c; c.val = 0; // &c: 0x...4ff2c0 | |
aligned_type d; d.val = 0; // &d: 0x...4ff300 | |
std::thread t1([&a] { work(a.val); }); | |
std::thread t2([&b] { work(b.val); }); | |
std::thread t3([&c] { work(c.val); }); | |
std::thread t4([&d] { work(d.val); }); | |
t1.join(); t2.join(); t3.join(); t4.join(); | |
return a.val + b.val + c.val + d.val; | |
} | |
} | |
static void test_false_sharing_resolved(benchmark::State& state, int test_num) | |
{ | |
using namespace false_sharing_resolved; | |
int (*fun)() = nullptr; | |
switch (test_num) | |
{ | |
case 1: | |
fun = test_1; | |
break; | |
case 2: | |
fun = test_2; | |
break; | |
case 3: | |
fun = test_3; | |
break; | |
case 4: | |
fun = test_4; | |
break; | |
default: | |
{ | |
std::cerr << "undefined test number given!" << std::endl; | |
std::terminate(); | |
} | |
break; | |
} | |
for (auto _ : state) { | |
benchmark::DoNotOptimize(fun()); | |
} | |
} | |
BENCHMARK_CAPTURE(test_false_sharing_resolved, one_threaded, 1)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_false_sharing_resolved, two_threaded, 2)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_false_sharing_resolved, three_threaded, 3)->UseRealTime()->Unit(benchmark::kMillisecond); | |
BENCHMARK_CAPTURE(test_false_sharing_resolved, four_threaded, 4)->UseRealTime()->Unit(benchmark::kMillisecond); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment