Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active July 29, 2021 18:53
Show Gist options
  • Save cellularmitosis/3327379b151445c602ad1406729fa4b4 to your computer and use it in GitHub Desktop.
Save cellularmitosis/3327379b151445c602ad1406729fa4b4 to your computer and use it in GitHub Desktop.
Hash table in C, part 5: better hashing

Blog 2020/8/21

<- previous | index | next ->

Hash table in C, part 5: better hashing

Let's learn how to implement a hash table in C! (See also part 1, 2, 3 and 4).

Let's take a look at the importance of the hash function by comparing "terrible hash" to DJB2 and SDBM

size_t djb2Hash(void* datap, size_t length) {
    size_t hash = 5381;
    uint8_t* cursorp = datap;
    for (size_t i=0; i < length; cursorp++, i++) {
        hash = ((hash << 5) + hash) ^ (*cursorp);
    }
    return hash;
}
size_t sdbmHash(void* datap, size_t length) {
    size_t hash = 0;
    uint8_t* cursorp = datap;
    for (size_t i=0; i < length; cursorp++, i++) {
        hash = (*cursorp) + (hash << 6) + (hash << 16) - hash;
    }
    return hash;
}

Hashing the dictionary

To analyze the performance of these hash functions, we'll add all of the words (about 100,000) in /usr/share/dict/words as keys in a hash table, then look at a histogram of slot list lengths.

We'll use a function pointer so that we can easily pass in different hashing functions.

HashTable* hashTheDictionary(size_t (*hashFuncp)(void*, size_t)) {
    // open the dictionary file
    int fd = open("/usr/share/dict/words", O_RDONLY);
    assert(fd != -1);

    // get the size of the file
    struct stat statbuf;
    int err = fstat(fd, &statbuf);
    assert(err == 0);
    off_t fsize = statbuf.st_size;

    // read the file into memory.
    char* fbuff = (char*)malloc(fsize);
    assert(fbuff != NULL);
    ssize_t bytesRead = read(fd, fbuff, fsize);
    assert(bytesRead == fsize);

    HashTable* htp = HashTableMake(1);
    char* valuep = "common value";
    char* cursorp = fbuff;
    char* endp = fbuff + fsize;
    char* keyp = cursorp;
    while (cursorp != endp) {
        if (*cursorp == '\n') {
            *cursorp = '\0';
            size_t keyLength = cursorp - keyp;
            if (keyLength > 0) {
                size_t keyHash = hashFuncp(keyp, keyLength);
                HashTableSet(htp, keyp, keyLength, keyHash, valuep);
            }
            cursorp++;
            keyp = cursorp;
            continue;
        } else {
            cursorp++;
            continue;
        }
    }

    return htp;
}
void analyzeHashTable(HashTable* htp) {
    #define LENGTHS_LEN (1000)
    size_t score = 0;
    size_t lengths[LENGTHS_LEN] = {0};
    for (size_t i=0; i < htp->capacity; i++) {
        HTSlot* slotp = htp->slotspp[i];
        size_t chainLength = 0;
        while (slotp != NULL) {
            if (slotp->valuep != NULL) {
                chainLength++;
            }
            slotp = slotp->nextp;
            continue;
        }
        lengths[chainLength]++;
        score += (chainLength * chainLength);
        continue;
    }

    printf("Chain lengths histogram:\n");
    printf("length occurrences\n");
    printf("------ -----------\n");
    for (size_t i=0; i < LENGTHS_LEN; i++) {
        if (lengths[i] == 0) {
            continue;
        }

        if (i < 10) {
            printf("     ");
        } else if (i < 100) {
            printf("    ");
        } else if (i < 1000) {
            printf("   ");
        } else if (i < 10000) {
            printf("  ");
        } else if (i < 100000) {
            printf(" ");
        }
        printf("%zu %zu\n", i, lengths[i]);
    }
    printf("population: %zu, slots: %zu, empty slots: %zu, score: %zu\n",
        htp->population, htp->capacity, lengths[0], score
    );
}
int main(int argc, char** argv) {
    HashTable* htp;

    printf("\ndjb2 hash\n");
    htp = hashTheDictionary(djb2Hash);
    analyzeHashTable(htp);

    printf("\nsdbm hash\n");
    htp = hashTheDictionary(sdbmHash);
    analyzeHashTable(htp);

    printf("\nterrible hash\n");
    htp = hashTheDictionary(terribleHash);
    analyzeHashTable(htp);

    return 0;
}

Results

What we are looking for is a "short tail". The ideal hash function would give us an even distribution of hash values. That is, if we had N slots and N keys, all of our slot lists would be length 1.

Practically, some slots will be empty and others will be length > 1.

terrible hash

First, let's take a look at "terrible hash":

terrible hash
Chain lengths histogram:
length occurrences
------ -----------
     0 63741
     1 206
     2 95
     3 67
     4 55
     5 35
     6 36
     7 28
     8 28
     9 32
    10 20
    11 20
    12 13
    13 22
    14 14
    15 12
(...many rows elided...)
    82 8
    83 10
    84 12
    85 7
    86 11
    87 6
    88 8
    89 2
    90 6
    91 5
(...many rows elided...)
   266 1
   267 1
   268 2
   269 2
   272 2
   274 1
   279 1
   283 1
   288 2
   298 1
population: 102401, slots: 65536, empty slots: 63741, score: 13240151

Oof! This is truly terrible! 97% of the slots are unused, and one of the slots has a chain which is 298 elements long!!!

This hash produces a long tail, which is the opposite of what we want!

DJB2 and SDBM

Now let's compare that to DJB2 and SDBM.

DJB2 hash
Chain lengths histogram:
length occurrences
------ -----------
     0 13809
     1 21547
     2 16591
     3 8633
     4 3484
     5 1096
     6 292
     7 69
     8 13
     9 2
population: 102401, slots: 65536, empty slots: 13809, score: 263639
SDBM hash
Chain lengths histogram:
length occurrences
------ -----------
     0 13778
     1 21522
     2 16539
     3 8841
     4 3447
     5 1056
     6 275
     7 66
     8 11
    10 1
population: 102401, slots: 65536, empty slots: 13778, score: 262737

Visually:

Chain length histogram1

DBJ2 and SDBM are remarkably similar in distribution!

Now let's compare DJB2 to "terrible hash" :)

Chain length histogram2

The difference in performance is so severe that they can't really be compared on the same graph! Let's zoom in:

Chain length histogram3

Next time

In part 6, we'll take a look at using prime numbers for our slot array sizes.

#include "HashTable.h"
#include <stdio.h> // fprintf
#include <stdlib.h> // exit
#include <string.h> // memset
#include <stdint.h> // uint8_t
#include <stdbool.h> // true
#include <assert.h> // assert
void* dmalloc(size_t size) {
void* p = malloc(size);
if (p == NULL) {
perror("Error: malloc failed: ");
exit(1);
}
memset(p, 0, size);
return p;
}
#define malloc(x) dont_use_malloc_use_dmalloc
size_t terribleHash(void* datap, size_t length) {
size_t sum = 0;
for (size_t i=0; i < length; i++) {
uint8_t* byte = datap + i;
sum += *byte;
continue;
}
return sum;
}
size_t djb2Hash(void* datap, size_t length) {
size_t hash = 5381;
uint8_t* cursorp = datap;
for (size_t i=0; i < length; cursorp++, i++) {
hash = ((hash << 5) + hash) ^ (*cursorp);
}
return hash;
}
size_t sdbmHash(void* datap, size_t length) {
size_t hash = 0;
uint8_t* cursorp = datap;
for (size_t i=0; i < length; cursorp++, i++) {
hash = (*cursorp) + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
static HTSlot* _HTSlotMake() {
HTSlot* sp = dmalloc(sizeof(HTSlot));
return sp;
}
HashTable* HashTableMake(size_t capacity) {
HashTable* htp = dmalloc(sizeof(HashTable));
htp->capacity = capacity;
htp->minCapacity = capacity;
htp->slotspp = dmalloc(sizeof(void*) * capacity);
return htp;
}
void HashTableFree(HashTable* htp) {
for (size_t i=0; i < htp->capacity; i++) {
HTSlot* slotp = htp->slotspp[i];
while (slotp != NULL) {
HTSlot* tmp = slotp;
free(slotp);
slotp = tmp->nextp;
continue;
}
}
free(htp->slotspp);
free(htp);
}
void* HashTableGet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash
) {
size_t slotIndex = keyHash % htp->capacity;
HTSlot* cursorp = htp->slotspp[slotIndex];
while (cursorp != NULL) {
if (cursorp->keyHash == keyHash
&& cursorp->keyLength == keyLength
&& memcmp(cursorp->keyp, keyp, keyLength) == 0
) {
return cursorp->valuep;
} else {
cursorp = cursorp->nextp;
continue;
}
}
return NULL;
}
static void _adjustCapacityIfNeeded(HashTable* htp);
void HashTableSet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash, void* valuep
) {
size_t origPop = htp->population;
size_t slotIndex = keyHash % htp->capacity;
HTSlot* cursorp = htp->slotspp[slotIndex];
while (cursorp != NULL) {
if (cursorp->keyHash == keyHash
&& cursorp->keyLength == keyLength
&& memcmp(cursorp->keyp, keyp, keyLength) == 0
) {
if (cursorp->valuep == NULL && valuep != NULL) {
htp->population++;
} else if (cursorp->valuep != NULL && valuep == NULL) {
htp->population--;
}
cursorp->valuep = valuep;
goto adjust;
} else {
cursorp = cursorp->nextp;
continue;
}
}
// no existing match? append a new slot to the list.
if (valuep != NULL) {
HTSlot* newSlotp = _HTSlotMake();
newSlotp->keyp = keyp;
newSlotp->keyLength = keyLength;
newSlotp->keyHash = keyHash;
newSlotp->valuep = valuep;
if (htp->slotspp[slotIndex] == NULL) {
htp->slotspp[slotIndex] = newSlotp;
} else {
HTSlot* lastp = htp->slotspp[slotIndex];
while (lastp->nextp != NULL) {
lastp = lastp->nextp;
continue;
}
lastp->nextp = newSlotp;
}
htp->population++;
}
adjust:
if (origPop != htp->population) {
_adjustCapacityIfNeeded(htp);
}
}
static bool _shouldShrink(HashTable* htp) {
return htp->capacity >= (htp->population * 2);
}
static bool _shouldGrow(HashTable* htp) {
return htp->population >= (htp->capacity * 2);
}
static void _adjustCapacityIfNeeded(HashTable* htp) {
size_t newCapacity;
if (_shouldShrink(htp)) {
newCapacity = htp->capacity / 2;
} else if (_shouldGrow(htp)) {
newCapacity = htp->capacity * 2;
} else {
return;
}
if (newCapacity < htp->minCapacity) {
return;
}
HTSlot** slots2pp = dmalloc(sizeof(void*) * newCapacity);
for (size_t i=0; i < htp->capacity; i++) {
HTSlot* cursorp = htp->slotspp[i];
while (cursorp != NULL) {
HTSlot* tmp = cursorp;
cursorp = cursorp->nextp;
if (tmp->valuep != NULL) {
tmp->nextp = NULL;
size_t slotIndex = tmp->keyHash % newCapacity;
if (slots2pp[slotIndex] == NULL) {
slots2pp[slotIndex] = tmp;
} else {
HTSlot* lastp = slots2pp[slotIndex];
while (lastp->nextp != NULL) {
lastp = lastp->nextp;
continue;
}
lastp->nextp = tmp;
}
}
continue;
}
}
free(htp->slotspp);
htp->slotspp = slots2pp;
htp->capacity = newCapacity;
}
#ifndef _HASHTABLE_H_
#define _HASHTABLE_H_
#include <unistd.h> // size_t
size_t terribleHash(void* datap, size_t length);
size_t djb2Hash(void* datap, size_t length);
size_t sdbmHash(void* datap, size_t length);
struct _HTSlot {
void* keyp;
size_t keyLength;
size_t keyHash;
void* valuep;
struct _HTSlot* nextp;
};
typedef struct _HTSlot HTSlot;
struct _HashTable {
HTSlot** slotspp;
size_t capacity;
size_t population;
size_t minCapacity;
};
typedef struct _HashTable HashTable;
HashTable* HashTableMake(size_t capacity);
void HashTableFree(HashTable* htp);
void* HashTableGet(HashTable* htp, void* keyp, size_t keyLength,
size_t keyHash
);
void HashTableSet(HashTable* htp, void* keyp, size_t keyLength, size_t keyHash,
void* valuep
);
#endif
#include <fcntl.h> // open
#include <assert.h> // assert
#include <stdlib.h> // malloc
#include <sys/stat.h> // fstat
#include <sys/types.h>
#include <unistd.h> // read
#include <stdio.h> // printf
#include <stdint.h> // uint8_t
#include "HashTable.h"
HashTable* hashTheDictionary(size_t (*hashFuncp)(void*, size_t)) {
// open the dictionary file
int fd = open("/usr/share/dict/words", O_RDONLY);
assert(fd != -1);
// get the size of the file
struct stat statbuf;
int err = fstat(fd, &statbuf);
assert(err == 0);
off_t fsize = statbuf.st_size;
// read the file into memory.
char* fbuff = (char*)malloc(fsize);
assert(fbuff != NULL);
ssize_t bytesRead = read(fd, fbuff, fsize);
assert(bytesRead == fsize);
HashTable* htp = HashTableMake(1);
char* valuep = "common value";
char* cursorp = fbuff;
char* endp = fbuff + fsize;
char* keyp = cursorp;
while (cursorp != endp) {
if (*cursorp == '\n') {
*cursorp = '\0';
size_t keyLength = cursorp - keyp;
if (keyLength > 0) {
size_t keyHash = hashFuncp(keyp, keyLength);
HashTableSet(htp, keyp, keyLength, keyHash, valuep);
}
cursorp++;
keyp = cursorp;
continue;
} else {
cursorp++;
continue;
}
}
return htp;
}
void analyzeHashTable(HashTable* htp) {
#define LENGTHS_LEN (1000)
size_t score = 0;
size_t lengths[LENGTHS_LEN] = {0};
for (size_t i=0; i < htp->capacity; i++) {
HTSlot* slotp = htp->slotspp[i];
size_t chainLength = 0;
while (slotp != NULL) {
if (slotp->valuep != NULL) {
chainLength++;
}
slotp = slotp->nextp;
continue;
}
lengths[chainLength]++;
score += (chainLength * chainLength);
continue;
}
printf("Chain lengths histogram:\n");
printf("length occurrences\n");
printf("------ -----------\n");
for (size_t i=0; i < LENGTHS_LEN; i++) {
if (lengths[i] == 0) {
continue;
}
if (i < 10) {
printf(" ");
} else if (i < 100) {
printf(" ");
} else if (i < 1000) {
printf(" ");
} else if (i < 10000) {
printf(" ");
} else if (i < 100000) {
printf(" ");
}
printf("%zu %zu\n", i, lengths[i]);
}
printf("population: %zu, slots: %zu, empty slots: %zu, score: %zu\n",
htp->population, htp->capacity, lengths[0], score
);
}
int main(int argc, char** argv) {
HashTable* htp;
printf("\ndjb2 hash\n");
htp = hashTheDictionary(djb2Hash);
analyzeHashTable(htp);
printf("\nsdbm hash\n");
htp = hashTheDictionary(sdbmHash);
analyzeHashTable(htp);
printf("\nterrible hash\n");
htp = hashTheDictionary(terribleHash);
analyzeHashTable(htp);
return 0;
}
default: tests
./tests
tests: HashTable.o tests.c
gcc -g -std=c99 -Wall -Werror HashTable.o tests.c -o tests
histograms: HashTable.o histograms.c
gcc -g -std=c99 -Wall -Werror HashTable.o histograms.c -o histograms
HashTable.o: HashTable.h HashTable.c
gcc -g -std=c99 -Wall -Werror -c HashTable.c
clean:
rm -f HashTable.o tests histograms
.PHONY: default clean
#include "HashTable.h"
#include <assert.h> // assert
#include <stdio.h> // printf
#include <string.h> // strlen, strcmp
void testMake() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
assert(htp != NULL);
assert(htp->capacity == 3);
printf("OK\n");
}
void testFree() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
HashTableFree(htp);
printf("OK\n");
}
void testSet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
size_t keyLength = strlen(keyp) + 1;
size_t keyHash = terribleHash(keyp, keyLength);
char* valuep = "world";
HashTableSet(htp, keyp, keyLength, keyHash, valuep);
assert(htp->population == 1);
printf("OK\n");
}
void testGet() {
printf("%s... ", __FUNCTION__);
size_t capacity = 3;
HashTable* htp = HashTableMake(capacity);
char* keyp = "hello";
size_t keyLength = strlen(keyp) + 1;
size_t keyHash = terribleHash(keyp, keyLength);
char* valuep = "world";
HashTableSet(htp, keyp, keyLength, keyHash, valuep);
char* gotp = HashTableGet(htp, keyp, keyLength, keyHash);
assert(strcmp(valuep, gotp) == 0);
printf("OK\n");
}
void testSlotCollision() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
char* key1p = "hello";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "world";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
char* key2p = "foo";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "bar";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
char* got1p = HashTableGet(htp, key1p, key1Length, key1Hash);
assert(strcmp(value1p, got1p) == 0);
char* got2p = HashTableGet(htp, key2p, key2Length, key2Hash);
assert(strcmp(value2p, got2p) == 0);
printf("OK\n");
}
void testHashCollision() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
char* key1p = "ab";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "cd";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
char* key2p = "ba";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "dc";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
char* got1p = HashTableGet(htp, key1p, key1Length, key1Hash);
assert(strcmp(value1p, got1p) == 0);
char* got2p = HashTableGet(htp, key2p, key2Length, key2Hash);
assert(strcmp(value2p, got2p) == 0);
printf("OK\n");
}
void testGrow() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
assert(htp->population == 0);
assert(htp->capacity == capacity);
char* key1p = "hello";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "world";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
assert(htp->capacity == capacity);
char* key2p = "foo";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "bar";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
assert(htp->capacity == capacity*2);
printf("OK\n");
}
void testShrink() {
printf("%s... ", __FUNCTION__);
size_t capacity = 1;
HashTable* htp = HashTableMake(capacity);
assert(htp->population == 0);
assert(htp->capacity == capacity);
char* key1p = "hello";
size_t key1Length = strlen(key1p) + 1;
size_t key1Hash = terribleHash(key1p, key1Length);
char* value1p = "world";
HashTableSet(htp, key1p, key1Length, key1Hash, value1p);
assert(htp->population == 1);
assert(htp->capacity == capacity);
char* key2p = "foo";
size_t key2Length = strlen(key2p) + 1;
size_t key2Hash = terribleHash(key2p, key2Length);
char* value2p = "bar";
HashTableSet(htp, key2p, key2Length, key2Hash, value2p);
assert(htp->population == 2);
assert(htp->capacity == capacity*2);
HashTableSet(htp, key2p, key2Length, key2Hash, NULL);
assert(htp->population == 1);
assert(htp->capacity == capacity);
printf("OK\n");
}
int main(int argc, char** argv) {
testMake();
testFree();
testSet();
testGet();
testSlotCollision();
testHashCollision();
testGrow();
testShrink();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment