Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active January 17, 2026 00:13
Show Gist options
  • Select an option

  • Save jweinst1/7fbeff5bbcc4deebcaef8fa654b4257d to your computer and use it in GitHub Desktop.

Select an option

Save jweinst1/7fbeff5bbcc4deebcaef8fa654b4257d to your computer and use it in GitHub Desktop.
write o1 hashmap
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
#include <limits>
#include <cstring>
static inline uint32_t rotl32(uint32_t x, int8_t r) {
return (x << r) | (x >> (32 - r));
}
static inline uint32_t fmix32(uint32_t h) {
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
uint32_t murmur3_32(const void* key, size_t len, uint32_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
const size_t nblocks = len / 4;
uint32_t h1 = seed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
// body
const uint32_t* blocks = reinterpret_cast<const uint32_t*>(data);
for (size_t i = 0; i < nblocks; i++) {
uint32_t k1 = blocks[i];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
// tail
const uint8_t* tail = data + nblocks * 4;
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= uint32_t(tail[2]) << 16;
case 2: k1 ^= uint32_t(tail[1]) << 8;
case 1: k1 ^= uint32_t(tail[0]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= static_cast<uint32_t>(len);
h1 = fmix32(h1);
return h1;
}
struct IdGen {
std::random_device rd;
std::mt19937 gen;
std::uniform_int_distribution<uint64_t> distrib;
IdGen(): rd(), gen(rd()), distrib(std::numeric_limits<uint64_t>::min(),
std::numeric_limits<uint64_t>::max()) {}
uint64_t next() {
return static_cast<uint64_t>(distrib(gen));
}
};
/**
* constexpr void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
bins[block].set(offset);
}
* */
struct bit64set {
size_t data = 0;
inline int getSlot(size_t idx) const {
const size_t mask = ~((1ULL << idx) - 1);
const size_t candidates = (~data) & mask;
if (candidates == 0) {
return -1;
}
return __builtin_ctzll(candidates);
}
inline void set(int idx) {
data |= size_t{1} << idx;
}
inline bool test(int idx) const {
return data & size_t{1} << idx;
}
inline bool isFull() const {
return data == std::numeric_limits<std::size_t>::max();
}
};
struct super2bit64set {
bit64set mark;
bit64set data[64];
inline int getSlot(size_t idx) const {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
int top = mark.getSlot(block);
if (top == -1) {
return -1;
}
return data[top].getSlot(offset) + (block << 6);
}
inline void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
data[block].set(offset);
if (data[block].isFull()) {
mark.set(block);
}
}
inline bool isFull() const {
return mark.isFull();
}
};
struct super3bit64set {
bit64set mark;
super2bit64set data[64];
inline long getSlot(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
int top = mark.getSlot(fblock);
if (top == -1) {
return -1;
}
return data[top].getSlot(block) + (fblock << 12);
}
inline void set(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
data[fblock].set(block);
if (data[fblock].isFull()) {
mark.set(fblock);
}
}
inline bool isFull() const {
return mark.isFull();
}
};
static constexpr size_t MAX_18_BIT_NUM = (size_t{1} << 18) - 1;
struct super4bit64set {
bit64set mark;
super3bit64set data[64];
inline long getSlot(size_t idx) {
const size_t block = idx & MAX_18_BIT_NUM;
const size_t fblock = idx >> 18;
int top = mark.getSlot(fblock);
if (top == -1) {
return -1;
}
return data[top].getSlot(block) + (fblock << 18);
}
inline void set(size_t idx) {
const size_t block = idx & MAX_18_BIT_NUM;
const size_t fblock = idx >> 18;
data[fblock].set(block);
if (data[fblock].isFull()) {
mark.set(fblock);
}
}
};
static void testBitSetters() {
bit64set foo{0b1111100111};
printf("%d\n", foo.getSlot(2));
printf("%d\n", foo.getSlot(7));
super2bit64set foo2;
foo2.set(1002);
foo2.set(1005);
foo2.set(1006);
printf("%d\n", foo2.getSlot(1002));
super3bit64set foo3;
foo3.set(9001);
foo3.set(9002);
foo3.set(9004);
foo3.set(9005);
foo3.set(19004);
foo3.set(54004);
printf("%ld\n", foo3.getSlot(9001));
printf("%ld\n", foo3.getSlot(19004));
printf("%ld\n", foo3.getSlot(54004));
}
static constexpr size_t keyMask = (1 << 18) - 1;
static void level3perftest() {
IdGen gens;
std::vector<uint32_t> nums;
for (int i = 0; i < 10000000; ++i) // 27ms
{
nums.push_back(gens.next() & keyMask);
}
super3bit64set mySets[70];
int curSet = 0;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nums.size(); ++i)
{
const auto mySlot = mySets[curSet].getSlot(nums[i]);
if (mySlot == -1) {
++curSet;
continue;
}
mySets[curSet].set(mySlot);
}
auto end = std::chrono::high_resolution_clock::now();
printf("cur set %d\n", curSet);
std::cout << "level 3 " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us\n";
}
/**
* making this larger spreads small accesses across a wider area
* making this smaller concentrates them and makes some areas hot
* */
static constexpr size_t keyMask2 = (1 << 24) - 1;
int main(int argc, char const *argv[])
{
IdGen gens;
std::vector<uint32_t> nums;
for (int i = 0; i < 10000000; ++i) // 66ms
{
nums.push_back(gens.next() & keyMask2);
}
super4bit64set mySet;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nums.size(); ++i)
{
const auto mySlot = mySet.getSlot(nums[i]);
if (mySlot == -1) {
continue;
}
mySet.set(mySlot);
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << "level 4 " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us\n";
return 0;
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
#include <limits>
#include <cstring>
static inline uint32_t rotl32(uint32_t x, int8_t r) {
return (x << r) | (x >> (32 - r));
}
static inline uint32_t fmix32(uint32_t h) {
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
uint32_t murmur3_32(const void* key, size_t len, uint32_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
const size_t nblocks = len / 4;
uint32_t h1 = seed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
// body
const uint32_t* blocks = reinterpret_cast<const uint32_t*>(data);
for (size_t i = 0; i < nblocks; i++) {
uint32_t k1 = blocks[i];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
// tail
const uint8_t* tail = data + nblocks * 4;
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= uint32_t(tail[2]) << 16;
case 2: k1 ^= uint32_t(tail[1]) << 8;
case 1: k1 ^= uint32_t(tail[0]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= static_cast<uint32_t>(len);
h1 = fmix32(h1);
return h1;
}
struct IdGen {
std::random_device rd;
std::mt19937 gen;
std::uniform_int_distribution<uint64_t> distrib;
IdGen(): rd(), gen(rd()), distrib(std::numeric_limits<uint64_t>::min(),
std::numeric_limits<uint64_t>::max()) {}
uint64_t next() {
return static_cast<uint64_t>(distrib(gen));
}
};
/**
* constexpr void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
bins[block].set(offset);
}
* */
struct bit64set {
size_t data = 0;
inline int getSlot(size_t idx) const {
const size_t mask = ~((1ULL << idx) - 1);
const size_t candidates = (~data) & mask;
if (candidates == 0) {
return -1;
}
return __builtin_ctzll(candidates);
}
inline void set(int idx) {
data |= size_t{1} << idx;
}
inline bool test(int idx) const {
return data & size_t{1} << idx;
}
inline bool isFull() const {
return data == std::numeric_limits<std::size_t>::max();
}
};
struct super2bit64set {
bit64set mark;
bit64set data[64];
inline int getSlot(size_t idx) const {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
int top = mark.getSlot(block);
if (top == -1) {
return -1;
}
return data[top].getSlot(offset) + (block << 6);
}
inline void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
data[block].set(offset);
if (data[block].isFull()) {
mark.set(block);
}
}
inline bool isFull() const {
return mark.isFull();
}
};
struct super3bit64set {
bit64set mark;
super2bit64set data[64];
inline long getSlot(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
int top = mark.getSlot(fblock);
if (top == -1) {
return -1;
}
return data[top].getSlot(block) + (fblock << 12);
}
inline void set(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
data[fblock].set(block);
if (data[fblock].isFull()) {
mark.set(fblock);
}
}
inline bool isFull() const {
return mark.isFull();
}
};
static constexpr size_t MAX_18_BIT_NUM = (size_t{1} << 18) - 1;
struct super4bit64set {
bit64set mark;
super3bit64set data[64];
inline long getSlot(size_t idx) {
const size_t block = idx & MAX_18_BIT_NUM;
const size_t fblock = idx >> 18;
int top = mark.getSlot(fblock);
if (top == -1) {
return -1;
}
return data[top].getSlot(block) + (fblock << 18);
}
inline void set(size_t idx) {
const size_t block = idx & MAX_18_BIT_NUM;
const size_t fblock = idx >> 18;
data[fblock].set(block);
if (data[fblock].isFull()) {
mark.set(fblock);
}
}
};
static void testBitSetters() {
bit64set foo{0b1111100111};
printf("%d\n", foo.getSlot(2));
printf("%d\n", foo.getSlot(7));
super2bit64set foo2;
foo2.set(1002);
foo2.set(1005);
foo2.set(1006);
printf("%d\n", foo2.getSlot(1002));
super3bit64set foo3;
foo3.set(9001);
foo3.set(9002);
foo3.set(9004);
foo3.set(9005);
foo3.set(19004);
foo3.set(54004);
printf("%ld\n", foo3.getSlot(9001));
printf("%ld\n", foo3.getSlot(19004));
printf("%ld\n", foo3.getSlot(54004));
}
static constexpr size_t keyMask = (1 << 18) - 1;
static void level3perftest() {
IdGen gens;
std::vector<uint32_t> nums;
for (int i = 0; i < 10000000; ++i) // 27ms
{
nums.push_back(gens.next() & keyMask);
}
super3bit64set mySets[70];
int curSet = 0;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nums.size(); ++i)
{
const auto mySlot = mySets[curSet].getSlot(nums[i]);
if (mySlot == -1) {
++curSet;
continue;
}
mySets[curSet].set(mySlot);
}
auto end = std::chrono::high_resolution_clock::now();
printf("cur set %d\n", curSet);
std::cout << "level 3 " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us\n";
}
/**
* making this larger spreads small accesses across a wider area
* making this smaller concentrates them and makes some areas hot
* */
static constexpr size_t keyMask2 = (1 << 24) - 1;
static void levle4perftest() {
IdGen gens;
std::vector<uint32_t> nums;
for (int i = 0; i < 10000000; ++i) // 66ms
{
nums.push_back(gens.next() & keyMask2);
}
super4bit64set mySet;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nums.size(); ++i)
{
const auto mySlot = mySet.getSlot(nums[i]);
if (mySlot == -1) {
continue;
}
mySet.set(mySlot);
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << "level 4 " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us\n";
}
enum class HashQueryRes {
Empty,
Occupied,
Found
};
class HashOnlyLinearMap {
struct KVPair {
uint32_t _hash = 0;
uint32_t _value;
inline bool add(uint32_t hash, uint32_t val) {
if (_hash == 0) {
_hash = hash;
_value = val;
return true;
} else if (_hash == hash) {
_value = val;
return true;
}
return false;
}
inline HashQueryRes find(uint32_t hash, uint32_t* val) const {
if (_hash == 0) {
return HashQueryRes::Empty;
} else if (_hash == hash) {
*val = _value;
return HashQueryRes::Found;
}
return HashQueryRes::Occupied;
}
};
struct Slot {
KVPair pair;
inline bool add(uint32_t hash, uint32_t val) {
return pair.add(hash, val);
}
inline HashQueryRes find(uint32_t hash, uint32_t* val) const {
return pair.find(hash, val);
}
};
std::vector<Slot> table;
size_t mask;
public:
explicit HashOnlyLinearMap(size_t power_of_two_capacity) {
size_t cap = 1ULL << power_of_two_capacity;
table.resize(cap);
mask = cap - 1;
}
void insert(uint32_t hash, uint32_t value) {
size_t idx = hash & mask;
while (true) {
Slot& s = table[idx];
if (s.add(hash, value)) {
return;
}
idx = (idx + 1) & mask; // linear probe
}
}
uint32_t find(uint32_t hash) const {
size_t idx = hash & mask;
uint32_t myNum;
while (true) {
const Slot& s = table[idx];
const HashQueryRes res = s.find(hash, &myNum);
if (res == HashQueryRes::Found) {
return myNum;
} else if (res == HashQueryRes::Empty) {
return 0;
}
idx = (idx + 1) & mask;
}
}
};
static void hashperftest() {
static constexpr size_t keyMask3 = (1 << 18) - 1;
IdGen gens;
std::vector<uint32_t> nums;
/**
* hash ins 12226us
822616934hash find 7942us
* */
for (int i = 0; i < 10000000; ++i)
{
nums.push_back(gens.next() & keyMask3);
}
HashOnlyLinearMap foobar(24);
auto startIns = std::chrono::high_resolution_clock::now();
for (const auto& num : nums)
{
foobar.insert(num, num);
}
auto endIns = std::chrono::high_resolution_clock::now();
std::cout << "hash ins " << std::chrono::duration_cast<std::chrono::microseconds>(endIns - startIns).count() << "us\n";
auto startFind = std::chrono::high_resolution_clock::now();
uint32_t total = 0;
for (const auto& num : nums)
{
total += foobar.find(num);
}
auto endFind = std::chrono::high_resolution_clock::now();
std::cout << total << "hash find " << std::chrono::duration_cast<std::chrono::microseconds>(endFind - startFind).count() << "us\n";
}
int main(int argc, char const *argv[])
{
hashperftest();
return 0;
}
#include <iostream>
#include <vector>
#include <unordered_map>
#include <random>
#include <chrono>
#include <cstdint>
#include <limits>
#include <cstring>
static inline uint32_t rotl32(uint32_t x, int8_t r) {
return (x << r) | (x >> (32 - r));
}
static inline uint32_t fmix32(uint32_t h) {
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
uint32_t murmur3_32(const void* key, size_t len, uint32_t seed = 0) {
const uint8_t* data = static_cast<const uint8_t*>(key);
const size_t nblocks = len / 4;
uint32_t h1 = seed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
// body
const uint32_t* blocks = reinterpret_cast<const uint32_t*>(data);
for (size_t i = 0; i < nblocks; i++) {
uint32_t k1 = blocks[i];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
// tail
const uint8_t* tail = data + nblocks * 4;
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= uint32_t(tail[2]) << 16;
case 2: k1 ^= uint32_t(tail[1]) << 8;
case 1: k1 ^= uint32_t(tail[0]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
}
// finalization
h1 ^= static_cast<uint32_t>(len);
h1 = fmix32(h1);
return h1;
}
struct IdGen {
std::random_device rd;
std::mt19937 gen;
std::uniform_int_distribution<uint64_t> distrib;
IdGen(): rd(), gen(rd()), distrib(std::numeric_limits<uint64_t>::min(),
std::numeric_limits<uint64_t>::max()) {}
uint64_t next() {
return static_cast<uint64_t>(distrib(gen));
}
};
/**
* constexpr void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
bins[block].set(offset);
}
* */
struct bit64set {
size_t data = 0;
inline int getSlot(size_t idx) const {
const size_t mask = ~((1ULL << idx) - 1);
const size_t candidates = (~data) & mask;
if (candidates == 0) {
return -1;
}
return __builtin_ctzll(candidates);
}
inline void set(int idx) {
data |= size_t{1} << idx;
}
inline bool isFull() const {
return data == std::numeric_limits<std::size_t>::max();
}
};
struct super2bit64set {
bit64set mark;
bit64set data[64];
inline int getSlot(size_t idx) const {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
int top = mark.getSlot(block);
if (top == -1) {
return -1;
}
return data[top].getSlot(offset) + (block << 6);
}
inline void set(size_t idx) {
const size_t block = idx >> 6;
const size_t offset = idx & 63;
data[block].set(offset);
if (data[block].isFull()) {
mark.set(block);
}
}
inline bool isFull() const {
return mark.isFull();
}
};
struct super3bit64set {
bit64set mark;
super2bit64set data[64];
inline long getSlot(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
int top = mark.getSlot(fblock);
if (top == -1) {
return -1;
}
return data[top].getSlot(block) + (fblock << 12);
}
inline void set(size_t idx) {
const size_t block = idx & 4095;
const size_t fblock = idx >> 12;
data[fblock].set(block);
if (data[fblock].isFull()) {
mark.set(fblock);
}
}
};
static void testBitSetters() {
bit64set foo{0b1111100111};
printf("%d\n", foo.getSlot(2));
printf("%d\n", foo.getSlot(7));
super2bit64set foo2;
foo2.set(1002);
foo2.set(1005);
foo2.set(1006);
printf("%d\n", foo2.getSlot(1002));
super3bit64set foo3;
foo3.set(9001);
foo3.set(9002);
foo3.set(9004);
foo3.set(9005);
foo3.set(19004);
foo3.set(54004);
printf("%ld\n", foo3.getSlot(9001));
printf("%ld\n", foo3.getSlot(19004));
printf("%ld\n", foo3.getSlot(54004));
}
static constexpr size_t keyMask = (1 << 18) - 1;
int main(int argc, char const *argv[])
{
IdGen gens;
std::vector<uint32_t> nums;
for (int i = 0; i < 10000000; ++i) // 27ms
{
nums.push_back(gens.next() & keyMask);
}
super3bit64set mySets[70];
int curSet = 0;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < nums.size(); ++i)
{
const auto mySlot = mySets[curSet].getSlot(nums[i]);
if (mySlot == -1) {
++curSet;
continue;
}
mySets[curSet].set(mySlot);
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << " " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us\n";
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment