Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active January 12, 2026 04:48
Show Gist options
  • Select an option

  • Save jweinst1/4fc762249cc1243effb41d816bd51637 to your computer and use it in GitHub Desktop.

Select an option

Save jweinst1/4fc762249cc1243effb41d816bd51637 to your computer and use it in GitHub Desktop.
hashmap for murmur3
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
// =======================================================
// MurmurHash3 x64 → 64-bit
// =======================================================
static inline uint32_t rotl32(uint32_t x, int8_t r) {
return (x << r) | (x >> (32 - r));
}
static inline uint32_t fmix32(uint32_t h) {
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
uint32_t murmur3_32(const void* key, size_t len, uint32_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
const size_t nblocks = len / 4;
uint32_t h1 = seed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
// body
const uint32_t* blocks = reinterpret_cast<const uint32_t*>(data);
for (size_t i = 0; i < nblocks; i++) {
uint32_t k1 = blocks[i];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
// tail
const uint8_t* tail = data + nblocks * 4;
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= uint32_t(tail[2]) << 16;
case 2: k1 ^= uint32_t(tail[1]) << 8;
case 1: k1 ^= uint32_t(tail[0]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= static_cast<uint32_t>(len);
h1 = fmix32(h1);
return h1;
}
// =======================================================
// Linear-probing hash-only map
// =======================================================
template<typename V>
class HashOnlyLinearMap {
struct KVPair {
uint32_t _hash = 0;
V _value;
inline bool add(uint32_t hash, const V& val) {
if (_hash == 0) {
_hash = hash;
_value = val;
return true;
} else if (_hash == hash) {
_value = val;
return true;
}
return false;
}
inline bool find(uint32_t hash, const V** val) const {
if (_hash == 0) {
return false;
} else if (_hash == hash) {
*val = &_value;
return true;
}
return false;
}
};
struct Slot {
KVPair pairs[8];
inline bool add(uint32_t hash, const V& val) {
return pairs[0].add(hash, val) ||
pairs[1].add(hash, val) ||
pairs[2].add(hash, val) ||
pairs[3].add(hash, val) ||
pairs[4].add(hash, val) ||
pairs[5].add(hash, val) ||
pairs[6].add(hash, val) ||
pairs[7].add(hash, val);
}
inline bool find(uint32_t hash, const V** val) const {
return pairs[0].find(hash, val) ||
pairs[1].find(hash, val) ||
pairs[2].find(hash, val) ||
pairs[3].find(hash, val) ||
pairs[4].find(hash, val) ||
pairs[5].find(hash, val) ||
pairs[6].find(hash, val) ||
pairs[7].find(hash, val);
}
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint32_t hash, const V& value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (s.add(hash, value)) {
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
const V* find(uint32_t hash) const {
size_t idx = hash & mask;
const V* myPtr = nullptr;
while (true) {
const Slot& s = table[idx];
if (s.find(hash, &myPtr)) {
return myPtr;
}
idx = (idx + 1) & mask;
}
}
};
// =======================================================
// Benchmark
// =======================================================
int main() {
constexpr size_t N = 10'000'000;
std::vector<uint32_t> keys(N);
std::mt19937_64 rng(123);
for (auto& k : keys) k = rng();
// Custom map
HashOnlyLinearMap<uint32_t> custom_map(26); // ~2M slots
/*
* Custom linear-probe map: 69689us
std::unordered_map: 298924us
no hash, per insert / lookup half
Custom linear-probe map: 18881us
std::unordered_map: 278250us
jweinstein@JOSWEINS-M-J60X cwork % ./hashbench
Custom linear-probe map: 91792us
std::unordered_map: 3308128us
32bit
jweinstein@JOSWEINS-M-J60X cwork % ./hashbench
Custom linear-probe map: 75551us
std::unordered_map: 3468358us
*/
for (auto k : keys) {
uint32_t h = murmur3_32(&k, sizeof(k));
custom_map.insert(k, k);
}
auto t1 = std::chrono::high_resolution_clock::now();
uint32_t sum1 = 0;
for (const auto& k : keys) {
uint32_t h = murmur3_32(&k, sizeof(k));
if (auto* v = custom_map.find(k)) {
sum1 += *v;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// std::unordered_map
std::unordered_map<uint32_t, std::vector<uint32_t>> std_map;
auto t3 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
std_map[k].push_back(k);
}
uint32_t sum2 = 0;
for (auto k : keys) {
for (auto x : std_map[k]) sum2 += x;
}
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Custom linear-probe map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us\n";
std::cout << "std::unordered_map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() << "us\n";
std::cout << "Ignore sums: " << sum1 << " " << sum2 << "\n";
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
// =======================================================
// MurmurHash3 x64 → 64-bit
// =======================================================
static inline uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}
static inline uint64_t fmix64(uint64_t k) {
k ^= k >> 33;
k *= 0xff51afd7ed558ccdULL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53ULL;
k ^= k >> 33;
return k;
}
uint64_t murmur3_64(const void* key, size_t len, uint64_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
size_t nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = 0x87c37b91114253d5ULL;
const uint64_t c2 = 0x4cf5ad432745937fULL;
const uint64_t* blocks = (const uint64_t*)data;
for (size_t i = 0; i < nblocks; i++) {
uint64_t k1 = blocks[i * 2];
uint64_t k2 = blocks[i * 2 + 1];
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
}
const uint8_t* tail = data + nblocks * 16;
uint64_t k1 = 0, k2 = 0;
switch (len & 15) {
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[9]) << 8;
case 9: k2 ^= uint64_t(tail[8]);
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[7]) << 56;
case 7: k1 ^= uint64_t(tail[6]) << 48;
case 6: k1 ^= uint64_t(tail[5]) << 40;
case 5: k1 ^= uint64_t(tail[4]) << 32;
case 4: k1 ^= uint64_t(tail[3]) << 24;
case 3: k1 ^= uint64_t(tail[2]) << 16;
case 2: k1 ^= uint64_t(tail[1]) << 8;
case 1: k1 ^= uint64_t(tail[0]);
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
}
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
return h1;
}
// =======================================================
// Linear-probing hash-only map
// =======================================================
template<typename V>
class HashOnlyLinearMap {
struct KVPair {
uint64_t _hash = 0;
V _value;
inline bool add(uint64_t hash, const V& val) {
if (_hash == 0) {
_hash = hash;
_value = val;
return true;
} else if (_hash == hash) {
_value = val;
return true;
}
return false;
}
inline bool find(uint64_t hash, const V** val) const {
if (_hash == 0) {
return false;
} else if (_hash == hash) {
*val = &_value;
return true;
}
return false;
}
};
struct Slot {
KVPair pairs[8];
inline bool add(uint64_t hash, const V& val) {
return pairs[0].add(hash, val) ||
pairs[1].add(hash, val) ||
pairs[2].add(hash, val) ||
pairs[3].add(hash, val) ||
pairs[4].add(hash, val) ||
pairs[5].add(hash, val) ||
pairs[6].add(hash, val) ||
pairs[7].add(hash, val);
}
inline bool find(uint64_t hash, const V** val) const {
return pairs[0].find(hash, val) ||
pairs[1].find(hash, val) ||
pairs[2].find(hash, val) ||
pairs[3].find(hash, val) ||
pairs[4].find(hash, val) ||
pairs[5].find(hash, val) ||
pairs[6].find(hash, val) ||
pairs[7].find(hash, val);
}
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint64_t hash, const V& value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (s.add(hash, value)) {
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
const V* find(uint64_t hash) const {
size_t idx = hash & mask;
const V* myPtr = nullptr;
while (true) {
const Slot& s = table[idx];
if (s.find(hash, &myPtr)) {
return myPtr;
}
idx = (idx + 1) & mask;
}
}
};
// =======================================================
// Benchmark
// =======================================================
int main() {
constexpr size_t N = 2'000'000;
std::vector<uint64_t> keys(N);
std::mt19937_64 rng(123);
for (auto& k : keys) k = rng();
// Custom map
HashOnlyLinearMap<uint64_t> custom_map(24); // ~2M slots
/*
* Custom linear-probe map: 69689us
std::unordered_map: 298924us
no hash, per insert / lookup half
Custom linear-probe map: 18881us
std::unordered_map: 278250us
*/
for (auto k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
custom_map.insert(k, k);
}
auto t1 = std::chrono::high_resolution_clock::now();
uint64_t sum1 = 0;
for (const auto& k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
if (auto* v = custom_map.find(k)) {
sum1 += *v;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// std::unordered_map
std::unordered_map<uint64_t, std::vector<uint64_t>> std_map;
std_map.reserve(N);
auto t3 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
std_map[k].push_back(k);
}
uint64_t sum2 = 0;
for (auto k : keys) {
for (auto x : std_map[k]) sum2 += x;
}
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Custom linear-probe map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us\n";
std::cout << "std::unordered_map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() << "us\n";
std::cout << "Ignore sums: " << sum1 << " " << sum2 << "\n";
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
// =======================================================
// MurmurHash3 x64 → 64-bit
// =======================================================
static inline uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}
static inline uint64_t fmix64(uint64_t k) {
k ^= k >> 33;
k *= 0xff51afd7ed558ccdULL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53ULL;
k ^= k >> 33;
return k;
}
uint64_t murmur3_64(const void* key, size_t len, uint64_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
size_t nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = 0x87c37b91114253d5ULL;
const uint64_t c2 = 0x4cf5ad432745937fULL;
const uint64_t* blocks = (const uint64_t*)data;
for (size_t i = 0; i < nblocks; i++) {
uint64_t k1 = blocks[i * 2];
uint64_t k2 = blocks[i * 2 + 1];
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
}
const uint8_t* tail = data + nblocks * 16;
uint64_t k1 = 0, k2 = 0;
switch (len & 15) {
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[9]) << 8;
case 9: k2 ^= uint64_t(tail[8]);
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[7]) << 56;
case 7: k1 ^= uint64_t(tail[6]) << 48;
case 6: k1 ^= uint64_t(tail[5]) << 40;
case 5: k1 ^= uint64_t(tail[4]) << 32;
case 4: k1 ^= uint64_t(tail[3]) << 24;
case 3: k1 ^= uint64_t(tail[2]) << 16;
case 2: k1 ^= uint64_t(tail[1]) << 8;
case 1: k1 ^= uint64_t(tail[0]);
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
}
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
return h1;
}
// =======================================================
// Linear-probing hash-only map
// =======================================================
template<typename V>
class HashOnlyLinearMap {
struct Slot {
uint64_t hash = 0;
bool occupied = false;
V value;
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint64_t hash, const V& value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (!s.occupied) {
s.occupied = true;
s.hash = hash;
s.value = value;
return;
}
if (s.hash == hash) {
s.value = value;
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
const V* find(uint64_t hash) const {
size_t idx = hash & mask;
while (true) {
const Slot& s = table[idx];
if (!s.occupied) {
return nullptr;
}
if (s.hash == hash) {
return &s.value;
}
idx = (idx + 1) & mask;
}
}
};
// =======================================================
// Benchmark
// =======================================================
int main() {
constexpr size_t N = 2'000'000;
std::vector<uint64_t> keys(N);
std::mt19937_64 rng(123);
for (auto& k : keys) k = rng();
// Custom map
HashOnlyLinearMap<uint64_t> custom_map(24); // ~2M slots
/*
* Custom linear-probe map: 69689us
std::unordered_map: 298924us
*/
auto t1 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
uint64_t h = murmur3_64(&k, sizeof(k));
custom_map.insert(h, k);
}
uint64_t sum1 = 0;
for (const auto& k : keys) {
uint64_t h = murmur3_64(&k, sizeof(k));
if (auto* v = custom_map.find(h)) {
sum1 += *v;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// std::unordered_map
std::unordered_map<uint64_t, std::vector<uint64_t>> std_map;
std_map.reserve(N);
auto t3 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
std_map[k].push_back(k);
}
uint64_t sum2 = 0;
for (auto k : keys) {
for (auto x : std_map[k]) sum2 += x;
}
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Custom linear-probe map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us\n";
std::cout << "std::unordered_map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() << "us\n";
std::cout << "Ignore sums: " << sum1 << " " << sum2 << "\n";
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
// =======================================================
// MurmurHash3 x64 → 64-bit
// =======================================================
static inline uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}
static inline uint64_t fmix64(uint64_t k) {
k ^= k >> 33;
k *= 0xff51afd7ed558ccdULL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53ULL;
k ^= k >> 33;
return k;
}
uint64_t murmur3_64(const void* key, size_t len, uint64_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
size_t nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = 0x87c37b91114253d5ULL;
const uint64_t c2 = 0x4cf5ad432745937fULL;
const uint64_t* blocks = (const uint64_t*)data;
for (size_t i = 0; i < nblocks; i++) {
uint64_t k1 = blocks[i * 2];
uint64_t k2 = blocks[i * 2 + 1];
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
}
const uint8_t* tail = data + nblocks * 16;
uint64_t k1 = 0, k2 = 0;
switch (len & 15) {
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[9]) << 8;
case 9: k2 ^= uint64_t(tail[8]);
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[7]) << 56;
case 7: k1 ^= uint64_t(tail[6]) << 48;
case 6: k1 ^= uint64_t(tail[5]) << 40;
case 5: k1 ^= uint64_t(tail[4]) << 32;
case 4: k1 ^= uint64_t(tail[3]) << 24;
case 3: k1 ^= uint64_t(tail[2]) << 16;
case 2: k1 ^= uint64_t(tail[1]) << 8;
case 1: k1 ^= uint64_t(tail[0]);
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
}
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
return h1;
}
// =======================================================
// Linear-probing hash-only map
// =======================================================
template<typename V>
class HashOnlyLinearMap {
struct Slot {
uint64_t hash = 0;
bool occupied = false;
V value;
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint64_t hash, const V& value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (!s.occupied) {
s.occupied = true;
s.hash = hash;
s.value = value;
return;
}
if (s.hash == hash) {
s.value = value;
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
const V* find(uint64_t hash) const {
size_t idx = hash & mask;
while (true) {
const Slot& s = table[idx];
if (!s.occupied) {
return nullptr;
}
if (s.hash == hash) {
return &s.value;
}
idx = (idx + 1) & mask;
}
}
};
// =======================================================
// Benchmark
// =======================================================
int main() {
constexpr size_t N = 2'000'000;
std::vector<uint64_t> keys(N);
std::mt19937_64 rng(123);
for (auto& k : keys) k = rng();
// Custom map
HashOnlyLinearMap<uint64_t> custom_map(24); // ~2M slots
/*
* Custom linear-probe map: 69689us
std::unordered_map: 298924us
no hash, per insert / lookup half
Custom linear-probe map: 18881us
std::unordered_map: 278250us
*/
for (auto k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
custom_map.insert(k, k);
}
auto t1 = std::chrono::high_resolution_clock::now();
uint64_t sum1 = 0;
for (const auto& k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
if (auto* v = custom_map.find(k)) {
sum1 += *v;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// std::unordered_map
std::unordered_map<uint64_t, std::vector<uint64_t>> std_map;
std_map.reserve(N);
auto t3 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
std_map[k].push_back(k);
}
uint64_t sum2 = 0;
for (auto k : keys) {
for (auto x : std_map[k]) sum2 += x;
}
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Custom linear-probe map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us\n";
std::cout << "std::unordered_map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() << "us\n";
std::cout << "Ignore sums: " << sum1 << " " << sum2 << "\n";
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
// =======================================================
// MurmurHash3 x64 → 64-bit
// =======================================================
static inline uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}
static inline uint64_t fmix64(uint64_t k) {
k ^= k >> 33;
k *= 0xff51afd7ed558ccdULL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53ULL;
k ^= k >> 33;
return k;
}
uint64_t murmur3_64(const void* key, size_t len, uint64_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
size_t nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = 0x87c37b91114253d5ULL;
const uint64_t c2 = 0x4cf5ad432745937fULL;
const uint64_t* blocks = (const uint64_t*)data;
for (size_t i = 0; i < nblocks; i++) {
uint64_t k1 = blocks[i * 2];
uint64_t k2 = blocks[i * 2 + 1];
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
}
const uint8_t* tail = data + nblocks * 16;
uint64_t k1 = 0, k2 = 0;
switch (len & 15) {
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[9]) << 8;
case 9: k2 ^= uint64_t(tail[8]);
k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[7]) << 56;
case 7: k1 ^= uint64_t(tail[6]) << 48;
case 6: k1 ^= uint64_t(tail[5]) << 40;
case 5: k1 ^= uint64_t(tail[4]) << 32;
case 4: k1 ^= uint64_t(tail[3]) << 24;
case 3: k1 ^= uint64_t(tail[2]) << 16;
case 2: k1 ^= uint64_t(tail[1]) << 8;
case 1: k1 ^= uint64_t(tail[0]);
k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1;
}
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
return h1;
}
// =======================================================
// Linear-probing hash-only map
// =======================================================
template<typename V>
class HashOnlyLinearMap {
struct KVPair {
uint64_t _hash = 0;
V _value;
inline bool add(uint64_t hash, const V& val) {
if (_hash == 0) {
_hash = hash;
_value = val;
return true;
} else if (_hash == hash) {
_value = val;
return true;
}
return false;
}
inline bool find(uint64_t hash, const V** val) const {
if (_hash == 0) {
return false;
} else if (_hash == hash) {
*val = &_value;
return true;
}
return false;
}
};
struct Slot {
KVPair pairs[8];
inline bool add(uint64_t hash, const V& val) {
return pairs[0].add(hash, val) ||
pairs[1].add(hash, val) ||
pairs[2].add(hash, val) ||
pairs[3].add(hash, val) ||
pairs[4].add(hash, val) ||
pairs[5].add(hash, val) ||
pairs[6].add(hash, val) ||
pairs[7].add(hash, val);
}
inline bool find(uint64_t hash, const V** val) const {
return pairs[0].find(hash, val) ||
pairs[1].find(hash, val) ||
pairs[2].find(hash, val) ||
pairs[3].find(hash, val) ||
pairs[4].find(hash, val) ||
pairs[5].find(hash, val) ||
pairs[6].find(hash, val) ||
pairs[7].find(hash, val);
}
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint64_t hash, const V& value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (s.add(hash, value)) {
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
const V* find(uint64_t hash) const {
size_t idx = hash & mask;
const V* myPtr = nullptr;
while (true) {
const Slot& s = table[idx];
if (s.find(hash, &myPtr)) {
return myPtr;
}
idx = (idx + 1) & mask;
}
}
};
// =======================================================
// Benchmark
// =======================================================
int main() {
constexpr size_t N = 10'000'000;
std::vector<uint64_t> keys(N);
std::mt19937_64 rng(123);
for (auto& k : keys) k = rng();
// Custom map
HashOnlyLinearMap<uint64_t> custom_map(24); // ~2M slots
/*
* Custom linear-probe map: 69689us
std::unordered_map: 298924us
no hash, per insert / lookup half
Custom linear-probe map: 18881us
std::unordered_map: 278250us
jweinstein@JOSWEINS-M-J60X cwork % ./hashbench
Custom linear-probe map: 91792us
std::unordered_map: 3308128us
*/
for (auto k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
custom_map.insert(k, k);
}
auto t1 = std::chrono::high_resolution_clock::now();
uint64_t sum1 = 0;
for (const auto& k : keys) {
//uint64_t h = murmur3_64(&k, sizeof(k));
if (auto* v = custom_map.find(k)) {
sum1 += *v;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// std::unordered_map
std::unordered_map<uint64_t, std::vector<uint64_t>> std_map;
auto t3 = std::chrono::high_resolution_clock::now();
for (auto k : keys) {
std_map[k].push_back(k);
}
uint64_t sum2 = 0;
for (auto k : keys) {
for (auto x : std_map[k]) sum2 += x;
}
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Custom linear-probe map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us\n";
std::cout << "std::unordered_map: "
<< std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count() << "us\n";
std::cout << "Ignore sums: " << sum1 << " " << sum2 << "\n";
}
@jweinst1
Copy link
Author

The 10 million run for inline chaining does around 91ms for 10 million keys lookup

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment