Last active
July 13, 2020 06:28
-
-
Save chenshuo/a0e7a13622578445168d to your computer and use it in GitHub Desktop.
khash in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <type_traits> | |
typedef uint32_t khint_t; | |
typedef khint_t khiter_t; | |
template<typename KEY, typename VALUE> | |
class KHash | |
{ | |
khint_t n_buckets = 0, size_ = 0, n_occupied = 0, upper_bound = 0; | |
uint32_t* flags_ = nullptr; | |
KEY* keys_ = nullptr; | |
static constexpr bool kh_is_map = !std::is_void<VALUE>::value; | |
typedef typename std::conditional<kh_is_map, VALUE, char>::type ValueType; | |
ValueType* vals_ = nullptr; | |
public: | |
KHash() { } | |
~KHash() | |
{ | |
free(flags_); | |
free(keys_); | |
free(vals_); | |
} | |
void clear(); | |
khiter_t get(KEY key) const; | |
khiter_t put(KEY key, int* ret); | |
// typename std::enable_if<kh_is_map, ValueType>::type& | |
ValueType& val(khiter_t x) { return vals_[x]; } | |
void del(khiter_t x); | |
khint_t size() const { return size_; } | |
private: | |
int resize(khint_t new_n_buckets); | |
// #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) | |
// #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) | |
// #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) | |
// #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) | |
//#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) | |
//#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) | |
//#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) | |
static bool __ac_isempty(uint32_t* flag, khint_t i) { return ((flag[i>>4]>>((i&0xfU)<<1))&2); } | |
bool __ac_isdel(khint_t i) { return ((flags_[i>>4]>>((i&0xfU)<<1))&1); } | |
bool __ac_iseither(khint_t i) { return ((flags_[i>>4]>>((i&0xfU)<<1))&3); } | |
static bool __ac_set_isempty_false(uint32_t* flag, khint_t i) { return (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))); } | |
bool __ac_set_isboth_false(khint_t i) { return (flags_[i>>4]&=~(3ul<<((i&0xfU)<<1))); } | |
bool __ac_set_isdel_true(khint_t i) { return (flags_[i>>4]|=1ul<<((i&0xfU)<<1)); } | |
static khint_t __hash_func(khint_t k) { return k; } | |
static bool __hash_equal(khint_t x, khint_t y) { return x == y; } | |
/* | |
static khint_t __hash_func(uint64_t k) { return k; } | |
static bool __hash_equal(uint64_t x, uint64_t y) { return x == y; } | |
*/ | |
static khint_t __hash_func(const char *s) | |
{ | |
// __ac_X31_hash_string | |
khint_t h = (khint_t)*s; | |
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; | |
return h; | |
} | |
static bool __hash_equal(const char* x, const char* y) { return strcmp(x, y) == 0; } | |
KHash(const KHash&) = delete; | |
void operator=(const KHash&) = delete; | |
}; | |
template<typename KEY, typename VALUE> | |
inline khiter_t KHash<KEY, VALUE>::put(KEY key, int* ret) | |
{ | |
khint_t x; | |
auto h = this; | |
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ | |
if (h->n_buckets > (h->size_<<1)) { | |
if (resize(h->n_buckets - 1) < 0) { /* clear "deleted" elements */ | |
*ret = -1; return h->n_buckets; | |
} | |
} else if (resize(h->n_buckets + 1) < 0) { /* expand the hash table */ | |
*ret = -1; return h->n_buckets; | |
} | |
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ | |
{ | |
khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; | |
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; | |
if (__ac_isempty(h->flags_, i)) x = i; /* for speed up */ | |
else { | |
last = i; | |
while (!__ac_isempty(h->flags_, i) && (__ac_isdel(i) || !__hash_equal(h->keys_[i], key))) { | |
if (__ac_isdel(i)) site = i; | |
i = (i + (++step)) & mask; | |
if (i == last) { x = site; break; } | |
} | |
if (x == h->n_buckets) { | |
if (__ac_isempty(h->flags_, i) && site != h->n_buckets) x = site; | |
else x = i; | |
} | |
} | |
} | |
if (__ac_isempty(h->flags_, x)) { /* not present at all */ | |
h->keys_[x] = key; | |
__ac_set_isboth_false(x); | |
++h->size_; ++h->n_occupied; | |
*ret = 1; | |
} else if (__ac_isdel(x)) { /* deleted */ | |
h->keys_[x] = key; | |
__ac_set_isboth_false(x); | |
++h->size_; | |
*ret = 2; | |
} else *ret = 0; /* Don't touch h->keys_[x] if present and not deleted */ | |
return x; | |
} | |
template<typename KEY, typename VALUE> | |
inline int KHash<KEY, VALUE>::resize(khint_t new_n_buckets) | |
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ | |
#ifndef kroundup32 | |
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
#endif | |
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) | |
static const double __ac_HASH_UPPER = 0.77; | |
auto h = this; | |
uint32_t *new_flags = 0; | |
khint_t j = 1; | |
{ | |
kroundup32(new_n_buckets); | |
if (new_n_buckets < 4) new_n_buckets = 4; | |
if (h->size_ >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ | |
else { /* hash table size to be changed (shrink or expand); rehash */ | |
new_flags = (uint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(uint32_t)); | |
if (!new_flags) return -1; | |
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(uint32_t)); | |
if (h->n_buckets < new_n_buckets) { /* expand */ | |
KEY *new_keys = (KEY*)realloc((void *)h->keys_, new_n_buckets * sizeof(KEY)); | |
if (!new_keys) { free(new_flags); return -1; } | |
h->keys_ = new_keys; | |
if (kh_is_map) { | |
ValueType *new_vals = (ValueType*)realloc((void *)h->vals_, new_n_buckets * sizeof(ValueType)); | |
if (!new_vals) { free(new_flags); return -1; } | |
h->vals_ = new_vals; | |
} | |
} /* otherwise shrink */ | |
} | |
} | |
if (j) { /* rehashing is needed */ | |
for (j = 0; j != h->n_buckets; ++j) { | |
if (__ac_iseither(j) == 0) { | |
KEY key = h->keys_[j]; | |
ValueType val; | |
khint_t new_mask; | |
new_mask = new_n_buckets - 1; | |
if (kh_is_map) val = h->vals_[j]; | |
__ac_set_isdel_true(j); | |
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ | |
khint_t k, i, step = 0; | |
k = __hash_func(key); | |
i = k & new_mask; | |
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; | |
__ac_set_isempty_false(new_flags, i); | |
if (i < h->n_buckets && __ac_iseither(i) == 0) { /* kick out the existing element */ | |
{ KEY tmp = h->keys_[i]; h->keys_[i] = key; key = tmp; } | |
if (kh_is_map) { ValueType tmp = h->vals_[i]; h->vals_[i] = val; val = tmp; } | |
__ac_set_isdel_true(i); /* mark it as deleted in the old hash table */ | |
} else { /* write the element and jump out of the loop */ | |
h->keys_[i] = key; | |
if (kh_is_map) h->vals_[i] = val; | |
break; | |
} | |
} | |
} | |
} | |
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ | |
h->keys_ = (KEY*)realloc((void *)h->keys_, new_n_buckets * sizeof(KEY)); | |
if (kh_is_map) h->vals_ = (ValueType*)realloc((void *)h->vals_, new_n_buckets * sizeof(ValueType)); | |
} | |
free(h->flags_); /* free the working space */ | |
h->flags_ = new_flags; | |
h->n_buckets = new_n_buckets; | |
h->n_occupied = h->size_; | |
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); | |
} | |
return 0; | |
#undef kroundup32 | |
#undef __ac_fsize | |
} | |
template<typename KEY, typename VALUE> | |
inline void KHash<KEY, VALUE>::del(khiter_t x) | |
{ | |
auto h = this; | |
if (x != h->n_buckets && !__ac_iseither(x)) { | |
__ac_set_isdel_true(x); | |
--h->size_; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
#include <time.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/time.h> | |
#include <memory> | |
#include "khash.hpp" | |
#include "khash.h" | |
KHASH_SET_INIT_STR(str) | |
KHASH_MAP_INIT_INT(int, unsigned char) | |
static int data_size = 5000000; | |
static unsigned *int_data; | |
static char **str_data; | |
void ht_init_data() | |
{ | |
int i; | |
char buf[256]; | |
khint32_t x = 11; | |
printf("--- generating data... "); | |
int_data = (unsigned*)calloc(data_size, sizeof(unsigned)); | |
str_data = (char**)calloc(data_size, sizeof(char*)); | |
for (i = 0; i < data_size; ++i) { | |
int_data[i] = (unsigned)(data_size * ((double)x / UINT_MAX) / 4) * 271828183u; | |
sprintf(buf, "%x", int_data[i]); | |
str_data[i] = strdup(buf); | |
x = 1664525L * x + 1013904223L; | |
} | |
printf("done!\n"); | |
} | |
void ht_destroy_data() | |
{ | |
int i; | |
for (i = 0; i < data_size; ++i) free(str_data[i]); | |
free(str_data); free(int_data); | |
} | |
void ht_khash_int() | |
{ | |
int i, ret; | |
unsigned *data = int_data; | |
khash_t(int) *h; | |
unsigned k; | |
h = kh_init(int); | |
for (i = 0; i < data_size; ++i) { | |
k = kh_put(int, h, data[i], &ret); | |
kh_val(h, k) = i&0xff; | |
if (!ret) kh_del(int, h, k); | |
} | |
printf("[ht_khash_int] size: %u\n", kh_size(h)); | |
kh_destroy(int, h); | |
} | |
void ht_khash_int2() | |
{ | |
int i, ret; | |
unsigned *data = int_data; | |
KHash<int, unsigned char> h; | |
// khash_t(int) *h; | |
unsigned k; | |
//h = kh_init(int); | |
for (i = 0; i < data_size; ++i) { | |
k = h.put(data[i], &ret); | |
h.val(k) = i&0xff; | |
if (!ret) h.del(k); | |
} | |
printf("[ht_khash_int2] size: %u\n", h.size()); | |
//kh_destroy(int, h); | |
} | |
void ht_khash_int3() | |
{ | |
int i, ret; | |
unsigned *data = int_data; | |
std::unique_ptr<KHash<int, unsigned char>> h(new KHash<int, unsigned char>); | |
// khash_t(int) *h; | |
unsigned k; | |
//h = kh_init(int); | |
for (i = 0; i < data_size; ++i) { | |
k = h->put(data[i], &ret); | |
h->val(k) = i&0xff; | |
if (!ret) h->del(k); | |
} | |
printf("[ht_khash_int3] size: %u\n", h->size()); | |
//kh_destroy(int, h); | |
} | |
void ht_khash_str() | |
{ | |
int i, ret; | |
char **data = str_data; | |
khash_t(str) *h; | |
unsigned k; | |
h = kh_init(str); | |
for (i = 0; i < data_size; ++i) { | |
k = kh_put(str, h, data[i], &ret); | |
if (!ret) kh_del(str, h, k); | |
} | |
printf("[ht_khash_str] size: %u\n", kh_size(h)); | |
kh_destroy(str, h); | |
} | |
void ht_khash_str2() | |
{ | |
int i, ret; | |
char **data = str_data; | |
//khash_t(str) *h; | |
KHash<const char*, void> h; | |
unsigned k; | |
//h = kh_init(str); | |
for (i = 0; i < data_size; ++i) { | |
k = h.put(data[i], &ret); | |
if (!ret) h.del(k); | |
} | |
printf("[ht_khash_str2] size: %u\n", h.size()); | |
//kh_destroy(str, h); | |
} | |
double now() | |
{ | |
struct timeval tv; | |
gettimeofday(&tv, NULL); | |
return tv.tv_sec + tv.tv_usec / 1000000.0; | |
} | |
void ht_timing(void (*f)(void)) | |
{ | |
double t = now(); | |
(*f)(); | |
printf("[ht_timing] %.4lf sec\n", (now() - t)); | |
} | |
int main(int argc, char *argv[]) | |
{ | |
if (argc > 1) data_size = atoi(argv[1]); | |
ht_init_data(); | |
ht_timing(ht_khash_int); | |
ht_timing(ht_khash_int2); | |
ht_timing(ht_khash_int3); | |
ht_timing(ht_khash_int); | |
ht_timing(ht_khash_int2); | |
ht_timing(ht_khash_int3); | |
//* | |
ht_timing(ht_khash_str); | |
ht_timing(ht_khash_str2); | |
ht_timing(ht_khash_str); | |
ht_timing(ht_khash_str2); | |
//*/ | |
//ht_timing(ht_khash_unpack); | |
//ht_timing(ht_khash_packed); | |
ht_destroy_data(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment