// g++ -g -O3 -fopenmp -std=c++20 -Iboost_1_89_0 -Wall -Wpedantic hashtable.cpp && perf stat -e branches,branch-misses,bus-cycles,cache-misses,cache-references,cycles,instructions,ref-cycles ./a.out
#include <atomic>
#include <functional>
#include <utility>
#include <string>
#include <vector>
#include <stdexcept>
#include <map>
#include <unordered_map>
#include <bit>
#include <future>
#include <chrono>
#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/concurrent_flat_map.hpp>
#include <omp.h>
#include <mutex>
#include <shared_mutex>
#include <algorithm>
#include <iterator>
#include <random>
#include <span>
/* Linear-probe hash table optimized for concurrent reads.
Lookups are lock-free: read a pointer to the backing array from an
atomic variable and do the lookup.
Inserts must be protected by a mutex. An insert writes the key/value
and then atomically sets the item's flag; readers immediately see
the change.
On resize, a new backing array is allocated, values are copied, and
the array pointer is atomically swapped to the new array. Readers
can still read data from the old array without locks.
Old arrays are never deallocated (until the table itself is
destroyed), so there are no dangling pointers. Since the array size
doubles each time, the maximum waste is O(n).
A usage sketch follows the class definition below.
=== Performance ===
This table is much faster than `unordered_flat_map+shared_mutex`
and faster than `concurrent_flat_map` in both sequential and
parallel lookups (only at around 300k keys is it slightly slower
than `concurrent_flat_map` in single-threaded mode; in all other
cases it's usually 2x+ faster).
Most importantly, it doesn't have the `concurrent_flat_map`
pathology on small tables and hot keys, so it's 10x+ faster on
small tables.
Inserts are not optimized. With every insert protected by a mutex
they can be 2-10x slower than in the other tables.
Lookups are also faster than the unprotected `unordered_flat_map`
for small (<4k) and big (>6M) tables. Small tables are faster due
to simpler logic, big tables are faster due to sequential access to
main memory. In the middle, boost still has its metadata with
optimized checks in the L2 cache and becomes faster. The drop when
falling out of the L3 cache happens later for `unordered_flat_map`,
so between 200k-500k keys its lookups are 1.5-2x faster (this can
be tuned by increasing the `LockFreeLookupHashTable` load factor).
`unordered_flat_map` is a good choice for medium-sized tables
(especially around 200k-500k keys) with no concurrent access, or
when there are many inserts.
`concurrent_flat_map` is considerably faster than
`unordered_flat_map+shared_mutex`. But it uses spinning mutexes and
is _pathologically_ slow, with high system CPU usage (due to
`sched_yield`), if multiple threads request the same key (or keys
from the same group -- which happens frequently in small maps). It
is slightly faster for inserts due to sharding, but we optimize for
reading. Its use case is a big table (>4k elements) with a
relatively high number of updates and no lookup hot spots (which
might be hard to guarantee). Not recommended: a "benchmark-oriented"
data structure that will fail you in the real world.
*/
template<
typename Key,
typename Value,
typename Hash = std::hash<Key>,
typename KeyEq = std::equal_to<Key>
>
class LockFreeLookupHashTable {
// A small load factor since there's no advanced logic to fight clustering.
// Falls out of the L3 cache sooner but is faster on small and large tables.
static constexpr int LOAD_FACTOR_NUMERATOR = 1;
static constexpr int LOAD_FACTOR_DENOMINATOR = 3;
struct Item {
std::atomic<size_t> flag; // 0 - empty, otherwise nonZeroHash(hash) of the key
// Keep the key and the value flat in the array for faster
// comparisons and value retrieval.
// Slows down resizes, but we don't expect many of them.
// Keeping the value inline means we fall out of the L1 cache
// sooner. Storing pointers instead might help; that's up to the user.
std::pair<Key, Value> keyValue;
};
using Table = std::vector<Item>;
std::atomic<Table*> table; // current pointer, swapped on resize
// previous tables, kept so readers never see dangling pointers;
// freed when the whole table is destroyed
std::vector<Table*> oldTables;
size_t used; // number of values added, for load factor and resizing
static inline Table emptyTable{}; // shared per-specialization empty array
public:
LockFreeLookupHashTable() : table(&emptyTable), used(0) {}
LockFreeLookupHashTable(size_t capacity) : LockFreeLookupHashTable() {
reserve(capacity);
}
LockFreeLookupHashTable(const LockFreeLookupHashTable&) = delete;
LockFreeLookupHashTable& operator=(const LockFreeLookupHashTable&) = delete;
LockFreeLookupHashTable(LockFreeLookupHashTable&&) = delete;
LockFreeLookupHashTable& operator=(LockFreeLookupHashTable&&) = delete;
~LockFreeLookupHashTable() {
Table* t = table.load(std::memory_order_acquire);
if (t != &emptyTable) // the shared static emptyTable is not heap-allocated
delete t;
for (auto old : oldTables) delete old;
}
// lock-free
bool visit(const Key& key, const auto& f) const {
const Value *r = lookupPtr(key);
if (r)
f(key, *r);
return r != nullptr;
}
// lock-free
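// The acquire loads of `table` and `flag` below pair with the release
// stores in resize() and emplaceIntoTable(): once a non-zero flag is
// observed, the key/value written before the releasing store are
// guaranteed to be visible as well.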
const Value *lookupPtr(const Key& key) const {
const Table& t = *table.load(std::memory_order_acquire);
size_t size = t.size();
size_t mask = size - 1;
size_t hash = Hash{}(key);
size_t index = hash & mask;
for (size_t i = 0; i < size; i++) {
const Item& item = t[(index+i) & mask];
size_t flag = item.flag.load(std::memory_order_acquire);
if (flag == nonZeroHash(hash) && KeyEq{}(key, item.keyValue.first))
return &item.keyValue.second;
// ++nFailedLookups;
// if (i >= maxFailedLookups) { maxFailedLookups = i+1; maxFailedLookupsKey = key; }
if (!flag)
return nullptr;
}
return nullptr;
}
// must be protected by a mutex
const Value *emplace(const Key& key, const Value& value) {
Table& t0 = *table.load(std::memory_order_acquire);
size_t requiredSize = capacitySize(used + 1);
Table& t = t0.size() >= requiredSize ? t0 : resize(t0, requiredSize);
return emplaceIntoTable(t, key, value);
}
// must be protected by a mutex
void reserve(size_t capacity) {
size_t requiredSize = capacitySize(capacity);
Table& t = *table.load(std::memory_order_acquire);
if (t.size() < requiredSize)
resize(t, requiredSize);
}
// debugging
static constexpr double LOAD_FACTOR = (double)LOAD_FACTOR_NUMERATOR / LOAD_FACTOR_DENOMINATOR;
double debugLoadFactor() { return (double)used / debugSize(); }
size_t debugSize() { return table.load()->size(); }
// mutable size_t nFailedLookups = 0;
// mutable size_t maxFailedLookups = 0;
// mutable Key maxFailedLookupsKey;
private:
// make hash non-zero to distinguish it from an empty item
static inline size_t nonZeroHash(size_t h) {
return h | 1;
// scaling doesn't change things much
// return (((__uint128_t)h * (UINT64_MAX - 1)) >> 64) + 1;
}
// table size enough to keep `capacity` items with a proper load factor
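// e.g. with a 1/3 load factor: capacity 100 -> 100*3 = 300 -> bit_ceil -> 512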
static size_t capacitySize(size_t capacity) {
return std::bit_ceil(capacity * LOAD_FACTOR_DENOMINATOR / LOAD_FACTOR_NUMERATOR);
}
const Value *emplaceIntoTable(Table& t, const Key& key, const Value& value) {
size_t size = t.size();
size_t mask = size - 1;
size_t hash = Hash{}(key);
size_t index = hash & mask;
for (size_t i = 0; i < size; i++) {
Item& item = t[(index+i) & mask];
size_t flag = item.flag.load(std::memory_order_acquire);
if (flag == nonZeroHash(hash) && KeyEq{}(key, item.keyValue.first))
return &item.keyValue.second; // return existing
if (flag == 0) {
++used;
item.keyValue.first = key;
item.keyValue.second = value;
item.flag.store(nonZeroHash(hash), std::memory_order_release);
return &item.keyValue.second;
}
}
// shouldn't end up here as we resize the table
throw std::runtime_error("the table is full?");
}
Table& resize(Table& t, size_t newSize) {
Table& newTable = *new Table(newSize);
used = 0; // emplaceIntoTable will update it
for (const auto& item : t)
if (item.flag.load(std::memory_order_acquire))
emplaceIntoTable(newTable, item.keyValue.first, item.keyValue.second);
if (&t != &emptyTable)
oldTables.push_back(&t);
table.store(&newTable, std::memory_order_release);
// printf("resized to %lu\n", newTable.size());
return newTable;
}
};
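// Minimal usage sketch (not part of the benchmark below; names are
// illustrative only): writers serialize emplace() behind their own mutex,
// readers call lookupPtr()/visit() with no lock at all.
[[maybe_unused]] static void exampleTableUsage() {
    LockFreeLookupHashTable<std::string, size_t> counters;
    std::mutex writeMutex;
    {
        // writer side: every emplace goes through the mutex
        std::scoped_lock _(writeMutex);
        counters.emplace("requests", 42);
    }
    // reader side: lock-free, safe to run concurrently with the writer above
    if (const size_t* v = counters.lookupPtr("requests"))
        printf("requests = %zu\n", *v);
    counters.visit("requests", [](const std::string& k, const size_t& v) {
        printf("%s = %zu\n", k.c_str(), v);
    });
}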
template<
typename Key,
typename Value,
typename Hash = std::hash<Key>,
typename KeyEq = std::equal_to<Key>
>
class LockFreeLookupFutureCache {
LockFreeLookupHashTable<Key, std::shared_future<Value>, Hash, KeyEq> cache;
std::mutex mutex;
public:
// func must be callable as Value()
template<typename F>
Value getOrCompute(const Key& key, F&& func) {
const std::shared_future<Value> *fut = cache.lookupPtr(key);
if (fut)
return fut->get();
std::packaged_task<Value()> task(std::forward<F>(func));
std::shared_future<Value> taskSharedFuture = task.get_future().share();
{
std::scoped_lock _(mutex);
fut = cache.lookupPtr(key);
if (fut)
return fut->get(); // already inserted
fut = cache.emplace(key, taskSharedFuture);
}
// Run the computation outside the lock. packaged_task will set the
// shared state (value or exception) for others to observe.
task();
return fut->get();
}
};
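// Minimal usage sketch (illustrative names only): concurrent
// getOrCompute() calls for the same key run the functor exactly once;
// every caller blocks on the same shared_future and gets the result
// (or the stored exception).
[[maybe_unused]] static size_t exampleFutureCacheUsage(const std::string& key) {
    static LockFreeLookupFutureCache<std::string, size_t> cache;
    return cache.getOrCompute(key, [&key] {
        return key.size() * 1000; // stand-in for an expensive computation
    });
}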
int main() {
size_t k = 1000, M = 1000*k;
std::vector<size_t> testKeySizes = {
100, //1*k, 2*k, 3*k, 4*k, 5*k, 6*k, 7*k, 8*k, 9*k,
10*k, //15*k, 25*k, 50*k, 75*k,
100*k,// 150*k, 200*k, 250*k, 300*k, 350*k, 400*k, 500*k, 600*k, 700*k, 800*k, 900*k,
1*M,// 2*M, 3*M, 4*M, 5*M, 6*M, 7*M, 8*M, 9*M, 10*M
};
std::vector<int> testNThreads = {
1, 2, 4, 8, 16
};
const int nLookups = 100*1000*1000;
// nKeys -> [(column, msec)]
std::map<size_t, std::map<std::string, double> > graphs;
// std::string mutexType = "null_mutex";
// struct null_mutex
// {
// void lock() {}
// void unlock() noexcept {}
// bool try_lock() { return true; }
// };
// using mutex = null_mutex;
// using lock = std::scoped_lock<mutex>;
std::string mutexType = "shared_mutex";
using mutex = std::shared_mutex;
using lock = std::shared_lock<mutex>;
// std::string mutexType = "mutex";
// using mutex = std::mutex;
// using lock = std::scoped_lock<mutex>;
// using Key = size_t;
// using Value = size_t;
// auto value = [](const Key& k) { return k; };
using Key = std::string;
using Value = size_t;
auto value = [](const Key& k) { return k.size(); };
// using Value = std::string;
// auto value = [](const Key& k) { return k; };
for (auto nThreads : testNThreads) {
if (nThreads > omp_get_max_threads()) {
printf("too many threads (%d > omp_get_max_threads = %d), skipping\n", nThreads, omp_get_max_threads());
continue;
}
for (auto nKeys : testKeySizes) {
using namespace std::chrono;
auto start = high_resolution_clock::now();
auto printRuntime = [&](const std::string& what) {
auto end = high_resolution_clock::now();
duration<double, std::milli> elapsed = end - start;
auto label = what + " (" + std::to_string(nThreads) + (nThreads > 1 ? " threads)" : " thread)");
printf("%60s %7lu %8.2f\n", label.c_str(), nKeys, elapsed.count());
graphs[nKeys][label] = elapsed.count();
// printf("%50s %8.2f ms\n", what.c_str(), elapsed.count());
start = high_resolution_clock::now();
};
std::vector<Key> keys;
keys.reserve(nKeys);
for (size_t i = 0; i < nKeys; i++) {
// keys.emplace_back(i);
auto key = //"long long prefix to slowdown comparisons" +
std::to_string(i);
keys.emplace_back(key);
}
std::span keysToInsert(keys);
// std::span keysToInsert(keys.begin(), keys.size() / 2);
std::span keysToLookup(keys);
// std::span keysToLookup(keys.begin(), keys.size() / 5000);
std::shuffle(keys.begin(), keys.end(), std::mt19937{});
std::string someKey = keys[nKeys / 2];
const int reserve = keysToInsert.size();
// printf("nKeys = %lu, reserve = %d, load factor = %.2f%%, total lookups = %d, threads = %d\n",
// nKeys, reserve, 100.0*LockFreeLookupHashTable<Key,Value>::LOAD_FACTOR, nLookups, nThreads);
// // the slowest, but faster than boost on nKeys<1000 (fits into L1 cache)
// {
// start = high_resolution_clock::now();
// std::unordered_map<Key, Value
// ,boost::hash<Key>
// // note that boost::hash is considerably
// // faster for std::string (it's inlined)
// > m; // perf shows DIV instructions
// m.reserve(reserve);
// for (auto& k : keysToInsert) m.emplace(k, value(k));
// printRuntime("std::unordered_map+" + mutexType + " append");
// mutex sm;
// #pragma omp parallel for num_threads(nThreads)
// for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
// const Value * volatile out; (void)out;
// for (auto& k : keysToLookup) {
// lock _(sm);
// out = &m[k];
// }
// }
// printRuntime("std::unordered_map+" + mutexType + " lookup");
// }
{
mutex sm;
start = high_resolution_clock::now();
boost::unordered_flat_map<Key, Value> m;
m.reserve(reserve);
#pragma omp parallel for num_threads(nThreads)
for (auto& k : keysToInsert) { std::scoped_lock _(sm); m.emplace(k, value(k)); }
printRuntime("boost::unordered_flat_map+" + mutexType + " append");
#pragma omp parallel for num_threads(nThreads)
for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
const Value * volatile out; (void)out;
for (auto& k : keysToLookup) {
lock _(sm);
out = &m[k];
}
}
printRuntime("boost::unordered_flat_map+" + mutexType + " lookup");
}
{
start = high_resolution_clock::now();
boost::concurrent_flat_map<Key, Value> m;
m.reserve(reserve);
#pragma omp parallel for num_threads(nThreads)
for (auto& k : keysToInsert) m.emplace(k, value(k));
printRuntime("boost::concurrent_flat_map append");
#pragma omp parallel for num_threads(nThreads)
for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
const Value * volatile out; (void)out;
for (auto& k : keysToLookup)
// it has a pathological behaviour (25% sys CPU running
// sched_yield) if we put the same key (e.g. "123") here
m.visit(k, [&](auto& x) { out = &x.second; });
}
printRuntime("boost::concurrent_flat_map lookup");
}
{
start = high_resolution_clock::now();
LockFreeLookupHashTable<Key, Value, boost::hash<Key> > m;
m.reserve(reserve);
std::mutex mutex;
#pragma omp parallel for num_threads(nThreads)
for (auto& k : keysToInsert) { std::scoped_lock _(mutex); m.emplace(k, value(k)); }
printRuntime("LockFreeLookupHashTable append");
#pragma omp parallel for num_threads(nThreads)
for (size_t i = 0; i < nLookups/keysToLookup.size(); i++) {
const Value * volatile out; (void)out;
for (auto& k : keysToLookup)
out = m.lookupPtr(k);
}
printRuntime("LockFreeLookupHashTable lookup");
// printf("debugLoadFactor = %.2f%%, debugSize = %lu\n", m.debugLoadFactor()*100, m.debugSize());
// printf("nFailedLookups = %lu (%.2f%%), maxFailedLookups = %lu (\"%s\")\n", m.nFailedLookups,
// 100.0 * m.nFailedLookups / nLookups, m.maxFailedLookups, m.maxFailedLookupsKey.data());
}
} // nKeys
} // nThreads
printf("\nFor Excel:\n\n# keys");
for (auto [name,_] : graphs[testKeySizes[0]]) {
printf(", %s", name.data());
}
printf("\n");
for (auto [nKeys, line] : graphs) {
printf("%lu", nKeys);
for (auto [_, msec] : line )
printf(", %.2f", msec);
printf("\n");
}
}