Last active
January 17, 2024 20:06
-
-
Save cgiosy/dda34f4701482f1c6d07fb521d43325f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define LOAD_FACTOR 0.5 | |
#include "robin-hood.hpp" | |
#define ERASE_TECHNIQUE USE_SHIFTING | |
#include "robin-hood.hpp" | |
#include "random.hpp" | |
#include "lib/bytell_hash_map.hpp" | |
#define LOOKUP_ZIPF true | |
#include <cmath> | |
#include <cstdint> | |
#include <cstddef> | |
#include <cassert> | |
#include <iomanip> | |
#include <iostream> | |
#include <fstream> | |
#include <iterator> | |
#include <type_traits> | |
#include <chrono> | |
#include <random> | |
#include <bit> | |
#include <array> | |
#include <vector> | |
#include <variant> | |
#include <algorithm> | |
#include <map> | |
uint64_t chainCnt = 0; | |
double zeta(double const z, uint64_t const N) { | |
double n = 0; | |
for (uint64_t i = 1; i <= N; i++) n += std::pow(i, -z); | |
return n; | |
} | |
#define now() std::chrono::steady_clock::now() | |
#define ms(a, b) std::chrono::duration<double>(b - a).count() * 1e12 | |
template<class T> | |
void shuffle(std::vector<T>& v) { | |
cgiosy::shuffle(cgiosy::wyrand, v.begin(), v.end()); | |
} | |
template<class Map> | |
void _bench(std::string TestName, size_t TC, size_t N, size_t M) { | |
constexpr double Z = 0.75; | |
// Initialize test data | |
std::vector<uint64_t> keys, lookups; | |
keys.reserve(N); | |
lookups.reserve(M); | |
auto const first = M / zeta(Z, N); | |
for (uint64_t i = 0; i < N; i++) { | |
#if LOOKUP_ZIPF | |
uint64_t cnt = first * pow(i + 1, -Z) + 0.5; | |
#else | |
uint64_t cnt = M / N; | |
#endif | |
auto key = cgiosy::wyrand(); | |
keys.push_back(key); | |
while (cnt--) lookups.push_back(key); | |
} | |
auto erasedKeys = keys; | |
shuffle(keys); | |
shuffle(lookups); | |
shuffle(erasedKeys); | |
std::map<std::string, std::vector<double>> timeTable; | |
auto measureTime = [&](std::string name, double n, auto f) { | |
auto const timeStart = now(); | |
f(); | |
auto const elapsed = ms(timeStart, now()); | |
timeTable[name].push_back(elapsed / n); | |
}; | |
for (uint64_t testCount = 0; testCount < TC; testCount++) { | |
Map set(N); | |
#if 0 | |
for (uint64_t i = 0; i < 30; i++) { | |
measureTime("insert", keys.size(), [&] { | |
for (auto const& key : keys) | |
set.emplace(key); | |
}); | |
assert(set.size() == keys.size()); | |
shuffle(erasedKeys); | |
measureTime("erase", erasedKeys.size(), [&] { | |
for (auto const& key : erasedKeys) | |
set.erase(key); | |
}); | |
assert(set.size() == 0); | |
} | |
measureTime("insert", keys.size(), [&] { | |
for (auto const& key : keys) | |
set.emplace(key); | |
}); | |
#else | |
measureTime("insert (initial)", keys.size(), [&] { | |
for (auto const& key : keys) | |
set.emplace(key); | |
}); | |
for (uint64_t i = 0; i < 30; i++) { | |
assert(set.size() == keys.size()); | |
measureTime("erase + insert", erasedKeys.size(), [&] { | |
for (auto const& key : erasedKeys) { | |
set.erase(key); | |
set.emplace(key); | |
} | |
}); | |
shuffle(erasedKeys); | |
} | |
#endif | |
assert(set.size() == keys.size()); | |
measureTime("contains (successful)", lookups.size(), [&] { | |
uint64_t res = 0; | |
for (auto const& key : lookups) | |
res += set.count(key); | |
assert(res == lookups.size()); | |
}); | |
measureTime("contains (unsuccessful)", lookups.size(), [&] { | |
uint64_t res = 0; | |
for (auto const& key : lookups) | |
res += set.count(key ^ 1); | |
assert(res == 0); | |
}); | |
measureTime("erase", erasedKeys.size(), [&] { | |
for (auto const& key : erasedKeys) | |
set.erase(key); | |
}); | |
measureTime("insert", erasedKeys.size(), [&] { | |
for (auto const& key : keys) | |
set.emplace(key); | |
}); | |
} | |
std::cout << "## " << TestName << std::endl; | |
for (auto [name, timeList] : timeTable) { | |
std::sort(timeList.begin(), timeList.end()); | |
double sum = 0; | |
for (auto const& time : timeList) | |
sum += time; | |
std::cout << "### " << name << '\n'; | |
std::cout << "- avg: " << sum / timeList.size() << '\n'; | |
std::cout << "- p50: " << timeList[int(timeList.size() * 0.5)] << '\n'; | |
std::cout << "- p99: " << timeList[int(timeList.size() * 0.99)] << '\n'; | |
std::cout << std::endl; | |
} | |
std::cout << std::endl; | |
} | |
#define bench(T, ...) _bench<T>(#T, __VA_ARGS__) | |
int main() { | |
std::ios::sync_with_stdio(0); | |
std::cin.tie(0); | |
std::cout << std::fixed << std::setprecision(3); | |
size_t L = 10, R = 10; | |
for (size_t n = L; n <= R; n += 5) { | |
uint64_t TEST_COUNT = 128 << (R - n); | |
size_t N = (1 << n) * (LOAD_FACTOR * 2 / 3) - 10; | |
size_t M = N * 5; | |
double P = 0.; | |
std::cout << "# " << n << std::endl; | |
for (size_t i = 0; i < 3; i++) { | |
bench(ska::bytell_hash_set<uint64_t>, TEST_COUNT, N, M); | |
bench(HashTable<uint64_t>, TEST_COUNT, N, M); | |
bench(HashTableShift<uint64_t>, TEST_COUNT, N, M); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <memory> | |
#include <iterator> | |
#include <algorithm> | |
#include <random> | |
#include <array> | |
namespace cgiosy { | |
template<int RW = 1, int LOCAL = 2, class T> | |
inline void prefetch(T& value) noexcept { | |
__builtin_prefetch(std::addressof(value), RW, LOCAL); | |
} | |
template<class Rng> | |
inline uint64_t random(Rng& rng, uint64_t const n) { | |
auto m = __uint128_t(rng()) * n; | |
if (uint64_t(m) < n) | |
while (uint64_t(m) < -n % n) | |
m = __uint128_t(rng()) * n; | |
return m >> 64; | |
} | |
template<class Rng, class Iter> | |
void shuffle(Rng& rng, Iter const a, Iter const b) { | |
size_t constexpr SZ = 512; | |
auto const n = std::distance(a, b); | |
if (n < std::max(SZ * 4, (1 << 20) / sizeof(decltype(*a)))) { | |
for (auto i = n - 1; i > 0; i -= 1) | |
std::swap(a[i], a[random(rng, i + 1)]); | |
return; | |
} | |
std::array<uint64_t, SZ> indexes; | |
for (auto p = SZ - 1; ~p; p -= 1) | |
prefetch(indexes[p] = random(rng, n + p - (SZ - 1))); | |
auto p = SZ - 1; | |
for (auto i = n - 1; i > SZ - 1; i -= 1) { | |
std::swap(a[i], a[indexes[p]]); | |
prefetch(indexes[p] = random(rng, i + 1 - SZ)); | |
p = int64_t(p - 1) < 0 ? SZ - 1 : p - 1; | |
} | |
for (auto i = SZ - 1; i > 0; i -= 1) { | |
std::swap(a[i], a[indexes[p]]); | |
p = int64_t(p - 1) < 0 ? SZ - 1 : p - 1; | |
} | |
} | |
static inline uint64_t wyseed = std::random_device{}(); | |
inline uint64_t wyrand() { | |
wyseed += 0xA0761D6478BD642F; | |
auto v = wyseed * __uint128_t(wyseed ^ 0xE7037ED1A0B428DB); | |
return v ^ v >> 64; | |
} | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <cstdint> | |
#include <iterator> | |
#include <variant> | |
#include <utility> | |
#include <algorithm> | |
#ifndef USE_SHIFTING | |
#define USE_SHIFTING 1 | |
#endif | |
#ifndef USE_GRAVEYARD | |
#define USE_GRAVEYARD 2 | |
#endif | |
#ifndef ERASE_TECHNIQUE | |
#define ERASE_TECHNIQUE USE_GRAVEYARD | |
#endif | |
#ifndef LOAD_FACTOR | |
#define LOAD_FACTOR 0.5 | |
#endif | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
#define HASH_TABLE HashTable | |
#else | |
#define HASH_TABLE HashTableShift | |
#endif | |
template<class Key, class Value = std::monostate> | |
struct HASH_TABLE { | |
#define USE_NONE 0 | |
#define USE_GOLDEN 1 | |
#define USE_WYRAND 2 | |
#define USE_SPLITMIX 3 | |
#define HASH_TYPE USE_GOLDEN | |
static inline uint64_t hash(uint64_t x) { | |
#if HASH_TYPE == USE_GOLDEN | |
// https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/linux/hash.h | |
x *= 0x61C8864680B583EB; | |
#elif HASH_TYPE == USE_WYRAND | |
// https://github.com/wangyi-fudan/wyhash/blob/e77036ac1943369dc03e611cde52a8570f8ceefe/wyhash.h#L149 | |
auto y = x * __uint128_t(x ^ 0xE7037ED1A0B428DB); | |
x = y ^ y >> 64; | |
#elif HASH_TYPE == USE_SPLITMIX | |
// https://xoshiro.di.unimi.it/splitmix64.c | |
x += 0x9E3779B97F4A7C15; | |
x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9; | |
x = (x ^ (x >> 27)) * 0x94D049BB133111EB; | |
return x ^ (x >> 31); | |
#endif | |
return x; | |
} | |
union KV { | |
struct { | |
Key key; | |
[[no_unique_address]] Value value; | |
}; | |
struct { | |
Key first; | |
[[no_unique_address]] Value second; | |
}; | |
}; | |
struct iterator { | |
using iterator_category = std::bidirectional_iterator_tag; | |
using difference_type = std::ptrdiff_t; | |
using value_type = KV; | |
using pointer = value_type*; | |
using reference = value_type&; | |
uint8_t* const data; | |
size_t const mask; | |
size_t pos; | |
inline reference operator*() const { return ((KV*)data)[pos]; } | |
inline pointer operator->() const { return &((KV*)data)[pos]; } | |
inline uint8_t& meta() const { | |
return data[~pos]; | |
} | |
inline uint8_t filled() const { | |
return uint8_t(meta() + 1) >= uint8_t(1 + 1); | |
} | |
inline uint8_t isHead() const { | |
return meta() <= 1; | |
} | |
inline iterator& operator++() { | |
pos = pos + 1 & mask; | |
return *this; | |
} | |
inline iterator& operator--() { | |
pos = pos - 1 & mask; | |
return *this; | |
} | |
inline bool operator!=(iterator const& b) const { | |
return pos != b.pos; | |
}; | |
}; | |
uint8_t* data; | |
size_t keyCount; | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
size_t graveyardCount; | |
#endif | |
uint8_t lg; | |
HASH_TABLE(size_t const reserved = 0) { | |
lg = reserved ? 64 - __builtin_clzll(size_t(reserved / LOAD_FACTOR)) : 0; | |
data = new uint8_t[size_t(sizeof(KV) + 1) << lg]; | |
std::fill(data, data + capacity(), 0); | |
data += capacity(); | |
keyCount = 0; | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
graveyardCount = capacity() >> 2; | |
#endif | |
} | |
~HASH_TABLE() { | |
deallocate(); | |
} | |
void deallocate() { | |
data -= capacity(); | |
operator delete[](data, size_t(sizeof(KV) + 1) << lg); | |
} | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
inline void graveyard() { | |
if (graveyardCount > 0) { | |
graveyardCount -= 1; | |
return; | |
} | |
for (size_t i = 0; i < capacity(); i += 1) | |
data[~i] = data[~i] == -1 ? 0 : data[~i]; | |
auto const remain = capacity() - keyCount; | |
auto const tombPeriod = (capacity() * 2) / remain; | |
graveyardCount = remain >> 2; | |
for (size_t i = 0; i < capacity(); i += tombPeriod) | |
data[~i] = data[~i] == 0 ? -1 : data[~i]; | |
} | |
#endif | |
inline size_t capacity() const { | |
return size_t(1) << lg; | |
} | |
inline size_t size() const { | |
return keyCount; | |
} | |
inline bool empty() const { | |
return size() == 0; | |
} | |
template<class Hash> | |
inline iterator iter(Hash const h) { | |
auto const pos = h >> (sizeof(Hash) * 8 - lg); | |
return {data, capacity() - 1, pos}; | |
} | |
inline iterator find(Key const& key) { | |
auto it = iter(hash(key)); | |
uint8_t dist = 1; | |
while (!(it->key == key)) [[unlikely]] { | |
if (it.meta() < dist) break; | |
++it; | |
++dist; | |
} | |
return it; | |
} | |
inline bool contains(Key const& key) { | |
return find(key)->key == key; | |
} | |
inline size_t count(Key const& key) { | |
return contains(key); | |
} | |
inline std::pair<iterator, bool> emplace(Key key, Value&& value = std::monostate{}) { | |
auto it = iter(hash(key)); | |
uint8_t dist = 1; | |
while (true) { | |
if (!it.filled()) [[likely]] { | |
it->key = std::move(key); | |
it->value = std::move(value); | |
it.meta() = dist; | |
break; | |
} | |
if (it->key == key) return {it, false}; | |
if (it.meta() < dist) { | |
std::swap(key, it->key); | |
std::swap(value, it->value); | |
std::swap(dist, it.meta()); | |
} | |
++it; | |
++dist; | |
} | |
keyCount += 1; | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
graveyard(); | |
#endif | |
return {it, true}; | |
} | |
inline std::pair<iterator, bool> insert(KV&& kv) { | |
emplace(std::move(kv.key), std::move(kv.value)); | |
} | |
inline void erase(iterator it) { | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
it->key = {}; | |
it->value = {}; | |
it.meta() = -1; | |
keyCount -= 1; | |
#if ERASE_TECHNIQUE == USE_GRAVEYARD | |
graveyard(); | |
#endif | |
#else | |
auto nxt = it; | |
++nxt; | |
while (!nxt.isHead()) { | |
it->key = std::move(nxt->key); | |
it->value = std::move(nxt->value); | |
it.meta() = nxt.meta() - 1; | |
++it; | |
++nxt; | |
} | |
it->key = {}; | |
it->value = {}; | |
it.meta() = 0; | |
keyCount -= 1; | |
#endif | |
} | |
inline size_t erase(Key const& key) { | |
auto it = find(key); | |
if (!(it->key == key)) return 0; | |
erase(it); | |
return 1; | |
} | |
}; | |
#undef ERASE_TECHNIQUE | |
#undef HASH_TABLE |
Author
cgiosy
commented
Aug 1, 2022
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment