// g++ -g -O3 -fopenmp -std=c++20 -Iboost_1_89_0 -Wall -Wpedantic hashtable.cpp && perf stat -e branches,branch-misses,bus-cycles,cache-misses,cache-references,cycles,instructions,ref-cycles ./a.out
#include <atomic>
#include <functional>
#include <utility>
#include <string>
#include <vector>
#include <stdexcept>
#include <map>
#include <unordered_map>
#include <bit>
#include <future>
#include <chrono>
#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/concurrent_flat_map.hpp>
#include <omp.h>
#include <mutex>
#include <shared_mutex>
#include <algorithm>
#include <iterator>
#include <random>
#include <span>
/* Concurrent, read-optimized linear-probe hash table.

   Lookups are lock-free: read a pointer to the backing array from an
   atomic variable and probe that array directly.

   Inserts must be protected by a mutex. An insert writes the key and
   value into an empty slot and then atomically sets the slot's flag,
   so readers see the new element immediately.

   On resize, a new backing array is allocated, the values are copied,
   and the array pointer is atomically swapped to the new array.
   Readers can keep reading the old array without locks.

   Old arrays are never deallocated, so there are no dangling
   pointers. Since the array size doubles on each resize, the maximum
   waste is O(n).

   See the usage sketch after the class definition.

   === Performance ===

   This table is much faster than `unordered_flat_map+shared_mutex`
   and faster than `concurrent_flat_map` in both sequential and
   parallel lookups (only at around 300k keys is it slightly slower
   than `concurrent_flat_map` in single-threaded mode; in all other
   cases it is usually 2x+ faster).

   Most importantly, it doesn't have the `concurrent_flat_map`
   pathology on small tables and hot keys, so it is 10x+ faster on
   small tables.

   Inserts are not optimized. If every insert is protected by a
   mutex, they can be 2-10x slower than in the other tables.

   Lookups are also faster than the unprotected `unordered_flat_map`
   for small (<4k) and big (>6M) tables. Small tables are faster due
   to simpler logic; big tables are faster due to sequential access
   to main memory. In the middle, boost still has its metadata with
   optimized checks in the L2 cache and becomes faster. The
   out-of-L3-cache drop happens later for `unordered_flat_map`, so
   between 200k and 500k keys `unordered_flat_map` lookups are 1.5-2x
   faster (this can be tuned by increasing the
   `LockFreeLookupHashTable` load factor).

   `unordered_flat_map` is a good choice for medium-sized tables
   (especially around 200k-500k keys) with no concurrent access, or
   when there are many inserts.

   `concurrent_flat_map` is considerably faster than
   `unordered_flat_map+shared_mutex`. But it uses spinning mutexes
   and is _pathologically_ slow, with high system CPU usage (due to
   `sched_yield`), when multiple threads request the same key (or
   keys from the same group -- which happens frequently in small
   maps). It is slightly faster for inserts thanks to sharding, but
   we optimize for reading. Its use case is a big table (>4k
   elements) with a relatively high number of updates and no lookup
   hot spots (which might be hard to guarantee). Not recommended: a
   "benchmark-oriented" data structure that will fail you in the real
   world.
*/
template<
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename KeyEq = std::equal_to<Key>
>
class LockFreeLookupHashTable {
    // Small load factor, as there's no advanced logic to fight clustering.
    // Falls out of the L3 cache sooner but works faster on small and large tables.
    static constexpr int LOAD_FACTOR_NUMERATOR = 1;
    static constexpr int LOAD_FACTOR_DENOMINATOR = 3;
    struct Item {
        std::atomic<size_t> flag; // 0 = empty, otherwise nonZeroHash(hash)
        // Keep the key and the value flat in the array for faster
        // comparisons and value retrieval.
        // This slows down resizes, but we don't expect many of them.
        // Keeping the value inline also means we fall out of the L1 cache
        // sooner. Storing pointers might help; that choice is left to the user.
        std::pair<Key, Value> keyValue;
    };
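    // For scale (an assumption about a typical 64-bit libstdc++ build): with
    // Key = std::string and Value = size_t, sizeof(Item) is about 48 bytes
    // (8-byte flag + 32-byte string + 8-byte size_t), so each probed slot
    // touches one or two cache lines.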
    using Table = std::vector<Item>;
    std::atomic<Table*> table; // current pointer, swapped on resize
    // List of previous tables, kept so that there are no dangling pointers;
    // the memory is freed at the end.
    std::vector<Table*> oldTables;
    size_t used; // number of values added, for load factor and resizing
    static inline Table emptyTable{}; // shared per-specialization empty array
public:
    LockFreeLookupHashTable() : table(&emptyTable), used(0) {}
    LockFreeLookupHashTable(size_t capacity) : LockFreeLookupHashTable() {
        reserve(capacity);
    }
    LockFreeLookupHashTable(const LockFreeLookupHashTable&) = delete;
    LockFreeLookupHashTable& operator=(const LockFreeLookupHashTable&) = delete;
    LockFreeLookupHashTable(LockFreeLookupHashTable&&) = delete;
    LockFreeLookupHashTable& operator=(LockFreeLookupHashTable&&) = delete;
    ~LockFreeLookupHashTable() {
        // the shared emptyTable is static and must not be deleted
        Table* t = table.load(std::memory_order_acquire);
        if (t != &emptyTable)
            delete t;
        for (auto old : oldTables)
            delete old;
    }
    // lock-free
    bool visit(const Key& key, const auto& f) const {
        const Value *r = lookupPtr(key);
        if (r)
            f(key, *r);
        return r != nullptr;
    }
    // lock-free
    const Value *lookupPtr(const Key& key) const {
        const Table& t = *table.load(std::memory_order_acquire);
        size_t size = t.size();
        size_t mask = size - 1;
        size_t hash = Hash{}(key);
        size_t index = hash & mask;
        for (size_t i = 0; i < size; i++) {
            const Item& item = t[(index+i) & mask];
            size_t flag = item.flag.load(std::memory_order_acquire);
            if (flag == nonZeroHash(hash) && KeyEq{}(key, item.keyValue.first))
                return &item.keyValue.second;
            // ++nFailedLookups;
            // if (i >= maxFailedLookups) { maxFailedLookups = i+1; maxFailedLookupsKey = key; }
            if (!flag)
                return nullptr;
        }
        return nullptr;
    }
    // must be protected by a mutex
    const Value *emplace(const Key& key, const Value& value) {
        Table& t0 = *table.load(std::memory_order_acquire);
        size_t requiredSize = capacitySize(used + 1);
        Table& t = t0.size() >= requiredSize ? t0 : resize(t0, requiredSize);
        return emplaceIntoTable(t, key, value);
    }
    // must be protected by a mutex
    void reserve(size_t capacity) {
        size_t requiredSize = capacitySize(capacity);
        Table& t = *table.load(std::memory_order_acquire);
        if (t.size() < requiredSize)
            resize(t, requiredSize);
    }
    // debugging
    static constexpr double LOAD_FACTOR = (double)LOAD_FACTOR_NUMERATOR / LOAD_FACTOR_DENOMINATOR;
    double debugLoadFactor() { return (double)used / debugSize(); }
    size_t debugSize() { return table.load()->size(); }
    // mutable size_t nFailedLookups = 0;
    // mutable size_t maxFailedLookups = 0;
    // mutable Key maxFailedLookupsKey;
private:
    // Make the hash non-zero to distinguish it from an empty item.
    static inline size_t nonZeroHash(size_t h) {
        return h | 1;
        // scaling doesn't change things much
        // return (((__uint128_t)h * (UINT64_MAX - 1)) >> 64) + 1;
    }
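    // e.g. nonZeroHash(0) == 1 and nonZeroHash(42) == 43; two distinct hashes
    // collide here only when they differ solely in the lowest bit, and key
    // equality is checked afterwards anyway.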
    // Table size sufficient to keep `capacity` items with the proper load factor.
    static size_t capacitySize(size_t capacity) {
        return std::bit_ceil(capacity * LOAD_FACTOR_DENOMINATOR / LOAD_FACTOR_NUMERATOR);
    }
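    // e.g. with the default 1/3 load factor, capacitySize(1000) ==
    // std::bit_ceil(3000) == 4096, and capacitySize(100'000) == 524288.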
    const Value *emplaceIntoTable(Table& t, const Key& key, const Value& value) {
        size_t size = t.size();
        size_t mask = size - 1;
        size_t hash = Hash{}(key);
        size_t index = hash & mask;
        for (size_t i = 0; i < size; i++) {
            Item& item = t[(index+i) & mask];
            size_t flag = item.flag.load(std::memory_order_acquire);
            if (flag == nonZeroHash(hash) && KeyEq{}(key, item.keyValue.first))
                return &item.keyValue.second; // return existing
            if (flag == 0) {
                ++used;
                item.keyValue.first = key;
                item.keyValue.second = value;
                item.flag.store(nonZeroHash(hash), std::memory_order_release);
                return &item.keyValue.second;
            }
        }
        // shouldn't end up here as we resize the table
        throw std::runtime_error("the table is full?");
    }
    Table& resize(Table& t, size_t newSize) {
        Table& newTable = *new Table(newSize);
        used = 0; // emplaceIntoTable will update it
        for (const auto& item : t)
            if (item.flag.load(std::memory_order_acquire))
                emplaceIntoTable(newTable, item.keyValue.first, item.keyValue.second);
        if (&t != &emptyTable)
            oldTables.push_back(&t);
        table.store(&newTable, std::memory_order_release);
        // printf("resized to %lu\n", newTable.size());
        return newTable;
    }
};
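// A minimal usage sketch (illustrative only, not used by the benchmark below):
// readers call lookupPtr()/visit() with no lock at all, while writers serialize
// every emplace() through their own external mutex. The names exampleCounters
// and exampleMutex are made up for this example.
inline void lockFreeLookupHashTableUsageExample() {
    static LockFreeLookupHashTable<std::string, size_t> exampleCounters;
    static std::mutex exampleMutex;
    {
        // writer side: every emplace must hold the mutex
        std::scoped_lock _(exampleMutex);
        exampleCounters.emplace("answer", 42);
    }
    // reader side: lock-free, safe to run concurrently with the writer above
    if (const size_t *v = exampleCounters.lookupPtr("answer"))
        (void)*v; // the pointed-to slot stays valid for the lifetime of the table
    exampleCounters.visit("answer", [](const std::string&, const size_t& v) {
        (void)v; // runs only if the key is present
    });
}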
template<
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename KeyEq = std::equal_to<Key>
>
class LockFreeLookupFutureCache {
    LockFreeLookupHashTable<Key, std::shared_future<Value>, Hash, KeyEq> cache;
    std::mutex mutex;
public:
    // func must be callable as Value()
    template<typename F>
    Value getOrCompute(const Key& key, F&& func) {
        const std::shared_future<Value> *fut = cache.lookupPtr(key);
        if (fut)
            return fut->get();
        std::packaged_task<Value()> task(std::forward<F>(func));
        std::shared_future<Value> taskSharedFuture = task.get_future().share();
        {
            std::scoped_lock _(mutex);
            fut = cache.lookupPtr(key);
            if (fut)
                return fut->get(); // already inserted
            fut = cache.emplace(key, taskSharedFuture);
        }
        // Run the computation outside the lock. packaged_task will set the
        // shared state (value or exception) for others to observe.
        task();
        return fut->get();
    }
};
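// A minimal usage sketch for LockFreeLookupFutureCache (illustrative only, not
// used by the benchmark): concurrent callers asking for the same key wait on
// one shared_future instead of recomputing the value. The name
// exampleLengthCache is made up for this example.
inline size_t futureCacheUsageExample(const std::string& key) {
    static LockFreeLookupFutureCache<std::string, size_t> exampleLengthCache;
    return exampleLengthCache.getOrCompute(key, [&key] {
        // the "expensive" computation; runs at most once per key
        return key.size();
    });
}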
int main() {
    size_t k = 1000, M = 1000*k;
    std::vector<size_t> testKeySizes = {
        100, //1*k, 2*k, 3*k, 4*k, 5*k, 6*k, 7*k, 8*k, 9*k,
        10*k, //15*k, 25*k, 50*k, 75*k,
        100*k,// 150*k, 200*k, 250*k, 300*k, 350*k, 400*k, 500*k, 600*k, 700*k, 800*k, 900*k,
        1*M,// 2*M, 3*M, 4*M, 5*M, 6*M, 7*M, 8*M, 9*M, 10*M
    };
    std::vector<int> testNThreads = {
        1, 2, 4, 8, 16
    };
    const int nLookups = 100*1000*1000;
    // nKeys -> [(column, msec)]
    std::map<size_t, std::map<std::string, double> > graphs;
    // std::string mutexType = "null_mutex";
    // struct null_mutex
    // {
    //     void lock() {}
    //     void unlock() noexcept {}
    //     bool try_lock() { return true; }
    // };
    // using mutex = null_mutex;
    // using lock = std::scoped_lock<mutex>;
    std::string mutexType = "shared_mutex";
    using mutex = std::shared_mutex;
    using lock = std::shared_lock<mutex>;
    // std::string mutexType = "mutex";
    // using mutex = std::mutex;
    // using lock = std::scoped_lock<mutex>;
    // using Key = size_t;
    // using Value = size_t;
    // auto value = [](const Key& k) { return k; };
    using Key = std::string;
    using Value = size_t;
    auto value = [](const Key& k) { return k.size(); };
    // using Value = std::string;
    // auto value = [](const Key& k) { return k; };
    for (auto nThreads : testNThreads) {
        if (nThreads > omp_get_max_threads()) {
            printf("too many threads (%d > omp_get_max_threads = %d), skipping\n", nThreads, omp_get_max_threads());
            continue;
        }
        for (auto nKeys : testKeySizes) {
            using namespace std::chrono;
            auto start = high_resolution_clock::now();
            auto printRuntime = [&](const std::string& what) {
                auto end = high_resolution_clock::now();
                duration<double, std::milli> elapsed = end - start;
                auto label = what + " (" + std::to_string(nThreads) + (nThreads > 1 ? " threads)" : " thread)");
                printf("%60s %7lu %8.2f\n", label.c_str(), nKeys, elapsed.count());
                graphs[nKeys][label] = elapsed.count();
                // printf("%50s %8.2f ms\n", what.c_str(), elapsed.count());
                start = high_resolution_clock::now();
            };
            std::vector<Key> keys;
            keys.reserve(nKeys);
            for (size_t i = 0; i < nKeys; i++) {
                // keys.emplace_back(i);
                auto key = //"long long prefix to slowdown comparisons" +
                    std::to_string(i);
                keys.emplace_back(key);
            }
            std::span keysToInsert(keys);
            // std::span keysToInsert(keys.begin(), keys.size() / 2);
            std::span keysToLookup(keys);
            // std::span keysToLookup(keys.begin(), keys.size() / 5000);
            std::shuffle(keys.begin(), keys.end(), std::mt19937{});
            std::string someKey = keys[nKeys / 2];
            const int reserve = keysToInsert.size();
            // printf("nKeys = %lu, reserve = %d, load factor = %.2f%%, total lookups = %d, threads = %d\n",
            //        nKeys, reserve, 100.0*LockFreeLookupHashTable<Key,Value>::LOAD_FACTOR, nLookups, nThreads);
            // // the slowest, but faster than boost on nKeys<1000 (fits into L1 cache)
            // {
            //     start = high_resolution_clock::now();
            //     std::unordered_map<Key, Value
            //         ,boost::hash<Key>
            //         // note that boost::hash is considerably
            //         // faster for std::string (it's inlined)
            //         > m; // perf shows DIV instructions
            //     m.reserve(reserve);
            //     for (auto& k : keysToInsert) m.emplace(k, value(k));
            //     printRuntime("std::unordered_map+" + mutexType + " append");
            //     mutex sm;
            //     #pragma omp parallel for num_threads(nThreads)
            //     for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
            //         const Value * volatile out; (void)out;
            //         for (auto& k : keysToLookup) {
            //             lock _(sm);
            //             out = &m[k];
            //         }
            //     }
            //     printRuntime("std::unordered_map+" + mutexType + " lookup");
            // }
            {
                mutex sm;
                start = high_resolution_clock::now();
                boost::unordered_flat_map<Key, Value> m;
                m.reserve(reserve);
                #pragma omp parallel for num_threads(nThreads)
                for (auto& k : keysToInsert) { std::scoped_lock _(sm); m.emplace(k, value(k)); }
                printRuntime("boost::unordered_flat_map+" + mutexType + " append");
                #pragma omp parallel for num_threads(nThreads)
                for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
                    const Value * volatile out; (void)out;
                    for (auto& k : keysToLookup) {
                        lock _(sm);
                        out = &m[k];
                    }
                }
                printRuntime("boost::unordered_flat_map+" + mutexType + " lookup");
            }
            {
                start = high_resolution_clock::now();
                boost::concurrent_flat_map<Key, Value> m;
                m.reserve(reserve);
                #pragma omp parallel for num_threads(nThreads)
                for (auto& k : keysToInsert) m.emplace(k, value(k));
                printRuntime("boost::concurrent_flat_map append");
                #pragma omp parallel for num_threads(nThreads)
                for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
                    const Value * volatile out; (void)out;
                    for (auto& k : keysToLookup)
                        // it has a pathological behaviour (25% sys CPU running
                        // sched_yield) if we put the same key (e.g. "123") here
                        m.visit(k, [&](auto& x) { out = &x.second; });
                }
                printRuntime("boost::concurrent_flat_map lookup");
            }
            {
                start = high_resolution_clock::now();
                LockFreeLookupHashTable<Key, Value, boost::hash<Key> > m;
                m.reserve(reserve);
                std::mutex mutex;
                #pragma omp parallel for num_threads(nThreads)
                for (auto& k : keysToInsert) { std::scoped_lock _(mutex); m.emplace(k, value(k)); }
                printRuntime("LockFreeLookupHashTable append");
                #pragma omp parallel for num_threads(nThreads)
                for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
                    const Value * volatile out; (void)out;
                    for (auto& k : keysToLookup)
                        out = m.lookupPtr(k);
                }
                printRuntime("LockFreeLookupHashTable lookup");
                // printf("debugLoadFactor = %.2f%%, debugSize = %lu\n", m.debugLoadFactor()*100, m.debugSize());
                // printf("nFailedLookups = %lu (%.2f%%), maxFailedLookups = %lu (\"%s\")\n", m.nFailedLookups,
                //        100.0 * m.nFailedLookups / nLookups, m.maxFailedLookups, m.maxFailedLookupsKey.data());
            }
        } // nKeys
    } // nThreads
    printf("\nFor Excel:\n\n# keys");
    for (auto [name,_] : graphs[testKeySizes[0]]) {
        printf(", %s", name.data());
    }
    printf("\n");
    for (auto [nKeys, line] : graphs) {
        printf("%lu", nKeys);
        for (auto [_, msec] : line)
            printf(", %.2f", msec);
        printf("\n");
    }
}