Skip to content

Instantly share code, notes, and snippets.

@UplinkCoder
Created August 28, 2024 22:41
Show Gist options
  • Save UplinkCoder/84a38f9843b3315281da044c2c7b1c9c to your computer and use it in GitHub Desktop.
Save UplinkCoder/84a38f9843b3315281da044c2c7b1c9c to your computer and use it in GitHub Desktop.
test_probing_with_the_bible
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include "crc32c.c"
#define TABLE_SIZE ((uint32_t)(4096 * 16)) // Size of the hash table
typedef struct {
uint32_t hashKey;
const char* value;
bool occupied;
uint32_t chainLength;
} HashTableSlot;
typedef struct {
uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount);
uint32_t slotCount;
uint32_t totalChainLength;
uint32_t totalDisplacement;
uint32_t entryCount;
uint32_t hits;
uint32_t misses;
HashTableSlot* slots;
} HashTable;
HashTable HashTable_Init(uint32_t slotCount, uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount)) {
HashTable table;
memset(&table, 0, sizeof(table));
table.GetSlotIndex = GetSlotIndex;
table.slots = (HashTableSlot*)calloc(slotCount, sizeof(*table.slots));
table.slotCount = slotCount;
return table;
}
// Simple primary hash functison
uint32_t primaryHash(uint32_t key) {
return key;
}
// Linear probing strategy
uint32_t linearProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + i);
}
// Linear probing strategy 2
uint32_t linearProbing2(uint32_t hashKey, uint32_t i) {
return (hashKey + (i * ((hashKey & 3)|1) ));
}
// Quadratic probing strategy
uint32_t quadraticProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + (i*i));
}
// Combined probing strategy
uint32_t combinedProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + (i < 16 ? i : i*i));
}
// Quadratic probing strategy
uint32_t quadraticProbing2(uint32_t hashKey, uint32_t i) {
return quadraticProbing(hashKey, i) + ((hashKey & 31) | 1);
}
/// returns true if the value has already been added.
/// false otherwise
bool GetOrAdd(HashTable* table, uint32_t key, const char* value) {
uint32_t hashKey = key;
uint32_t length = (key & 0xFF000000) >> 24;
uint32_t probeLength = 0;
uint32_t slotCount = table->slotCount;
int32_t idealSlotIndex = (int32_t) (hashKey % slotCount);
for (uint32_t i = 0;;i++) {
uint32_t slotIndex = (table->GetSlotIndex(hashKey, i) % slotCount);
HashTableSlot* slot = &table->slots[slotIndex];
if (i > TABLE_SIZE * 3)
{
// printf("Probe Limit reached aborting\n");
// abort();
int k = 2;
}
if (!slot->occupied) {
// Found an empty slot
slot->hashKey = hashKey;
slot->value = value;
slot->occupied = true;
slot->chainLength = i;
table->totalChainLength += i;
table->totalDisplacement += abs(((int32_t) slotIndex) - idealSlotIndex);
table->entryCount += 1;
if (i == 0) table->hits += 1;
// printf("%d\n", key);
return false;
} else {
if (slot->hashKey == key)
{
if (0 == memcmp(slot->value, value, length))
{
return true;
}
else
{
printf("!!! Oh a real hash collision !!!!\n");
}
}
}
}
// Table is full, insertion failed
return false;
}
static time_t initTs;
void HashTable_PrintStats(HashTable *self)
{
HashTable table = *self;
printf("Average Hits: %g %%\n", (double) table.hits / table.entryCount);
printf("Total Collisions: %u \n", table.totalChainLength);
printf("Average Chain Length: %lf\n", (double)table.totalChainLength / table.entryCount);
printf("Average Displacement: %lf\n\n", (double)table.totalDisplacement / table.entryCount);
}
typedef struct {
const char* textP;
uint8_t length;
} word_t;
typedef struct {
uint32_t (*FuncPtr)(uint32_t hash, uint32_t probeCount);
const char* FuncName;
} NextIndexFunc_t;
int main() {
double loadFactors[] = { 0.5, 0.75, 0.80, 0.95, 0.9999 };
const uint32_t slotCount = /*14197;*/ 14011;
FILE* fbible = fopen("bible.txt", "rb");
uint32_t approxWordCount = 0;
word_t* words = 0;
uint32_t wordCount = 0;
uint32_t wordCapacity = 0;
uint32_t uniqueWords = 0;
uint32_t size = 0;
char* wordStart = 0;
char * bible = 0;
bool inWord = false;
HashTable wordTable;
fseek(fbible, 0, SEEK_END);
size = ftell(fbible);
fseek(fbible, 0, SEEK_SET);
approxWordCount = 800000;
words = malloc(sizeof(word_t) * approxWordCount);
wordCapacity = approxWordCount;
// initTs = time(NULL);
bible = malloc(size + 1);
fread(bible, 1, size, fbible);
bible[size] = '\0';
for(char c, *p = bible; c = *p; ++p)
{
if (isalpha(c)) // if it's a letter
{
if (!inWord) // we are not in a word so a new one starts
{
wordStart = p;
inWord = true;
}
} else { // not a letter
if (inWord)
{
uint8_t wordLength = (uint8_t)(p - wordStart);
word_t word = {wordStart, wordLength};
words[wordCount++] = word;
}
inWord = false;
}
}
printf("wordCount: %u\n", wordCount);
printf("ApproxWordCount: %u\n", approxWordCount);
printf("loadFactor: %lf\n\n", ((double)13992 / slotCount));
NextIndexFunc_t NextIndexFuncs[] = {
{linearProbing, "linearProbing"},
{quadraticProbing, "quadraticProbing"},
{combinedProbing, "combinedProbing"},
{linearProbing2, "linearProbing2"},
};
for(int fnInx = 0; fnInx < (sizeof(NextIndexFuncs) / sizeof(*NextIndexFuncs)); fnInx++)
{
uniqueWords = 0;
NextIndexFunc_t fn = NextIndexFuncs[fnInx];
wordTable = HashTable_Init(slotCount, fn.FuncPtr);
printf("FuncName: %s\n", fn.FuncName);
for(uint32_t i = 0; i < wordCount; i++) {
uint32_t hashKey = (crc32c(~0, words[i].textP, words[i].length) & 0x00FFFFFF) | words[i].length << 24;
bool existing = GetOrAdd(&wordTable, hashKey, words[i].textP);
if (!existing) {
uniqueWords++;
// printf("%.*s\n", words[i].length, words[i].textP);
}
}
printf("UniqueWords: %u\n\n", uniqueWords);
HashTable_PrintStats(&wordTable);
}
return 0;
}
#include <time.h>
#include "crc32c.c"
#define TABLE_SIZE ((uint32_t)(4096 * 16)) // Size of the hash table
typedef struct {
uint32_t hashKey;
const char* value;
bool occupied;
uint32_t chainLength;
} HashTableSlot;
typedef struct {
uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount);
uint32_t slotCount;
uint32_t totalChainLength;
uint32_t totalDisplacement;
uint32_t entryCount;
uint32_t hits;
uint32_t misses;
HashTableSlot* slots;
} HashTable;
HashTable HashTable_Init(uint32_t slotCount, uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount)) {
HashTable table;
memset(&table, 0, sizeof(table));
table.GetSlotIndex = GetSlotIndex;
table.slots = (HashTableSlot*)calloc(slotCount, sizeof(*table.slots));
table.slotCount = slotCount;
return table;
}
// Simple primary hash functison
uint32_t primaryHash(uint32_t key) {
return key;
}
// Linear probing strategy
uint32_t linearProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + i);
}
// Linear probing strategy 2
uint32_t linearProbing2(uint32_t hashKey, uint32_t i) {
return (hashKey + (i * hashKey & 7));
}
// Quadratic probing strategy
uint32_t quadraticProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + (i*i));
}
// Combined probing strategy
uint32_t combinedProbing(uint32_t hashKey, uint32_t i) {
return (hashKey + (i < 16 ? i : i*i));
}
// Quadratic probing strategy
uint32_t quadraticProbing2(uint32_t hashKey, uint32_t i) {
return quadraticProbing(hashKey, i) + ((hashKey & 31) | 1);
}
/// returns true if the value has already been added.
/// false otherwise
bool GetOrAdd(HashTable* table, uint32_t key, const char* value) {
uint32_t hashKey = key;
uint32_t length = (key & 0xFF000000) >> 24;
uint32_t probeLength = 0;
uint32_t slotCount = table->slotCount;
int32_t idealSlotIndex = (int32_t) (hashKey % slotCount);
for (uint32_t i = 0;;i++) {
uint32_t slotIndex = (table->GetSlotIndex(hashKey, i) % slotCount);
HashTableSlot* slot = &table->slots[slotIndex];
if (i > TABLE_SIZE * 3)
{
// printf("Probe Limit reached aborting\n");
// abort();
int k = 2;
}
if (!slot->occupied) {
// Found an empty slot
slot->hashKey = hashKey;
slot->value = value;
slot->occupied = true;
slot->chainLength = i;
table->totalChainLength += i;
table->totalDisplacement += abs(((int32_t) slotIndex) - idealSlotIndex);
table->entryCount += 1;
if (i == 0) table->hits += 1;
// printf("%d\n", key);
return false;
} else {
if (slot->hashKey == key)
{
if (0 == memcmp(slot->value, value, length))
{
return true;
}
else
{
printf("!!! Oh a real hash collision !!!!\n");
}
}
}
}
// Table is full, insertion failed
return false;
}
static time_t initTs;
void HashTable_PrintStats(HashTable *self)
{
HashTable table = *self;
printf("Average Hits: %g %%\n", (double) table.hits / table.entryCount);
printf("Total Collisions: %u \n", table.totalChainLength);
printf("Average Chain Length: %lf\n", (double)table.totalChainLength / table.entryCount);
printf("Average Displacement: %lf\n\n", (double)table.totalDisplacement / table.entryCount);
}
typedef struct {
const char* textP;
uint8_t length;
} word_t;
typedef struct {
uint32_t (*FuncPtr)(uint32_t hash, uint32_t probeCount);
const char* FuncName;
} NextIndexFunc_t;
int main() {
double loadFactors[] = { 0.5, 0.75, 0.80, 0.95, 0.9999 };
const uint32_t slotCount = 14197; // 14011;
FILE* fbible = fopen("bible.txt", "rb");
uint32_t approxWordCount = 0;
word_t* words = 0;
uint32_t wordCount = 0;
uint32_t wordCapacity = 0;
uint32_t uniqueWords = 0;
uint32_t size = 0;
char* wordStart = 0;
char * bible = 0;
bool inWord = false;
HashTable wordTable;
fseek(fbible, 0, SEEK_END);
size = ftell(fbible);
fseek(fbible, 0, SEEK_SET);
approxWordCount = 800000;
words = malloc(sizeof(word_t) * approxWordCount);
wordCapacity = approxWordCount;
// initTs = time(NULL);
bible = malloc(size + 1);
fread(bible, 1, size, fbible);
bible[size] = '\0';
for(char c, *p = bible; c = *p; ++p)
{
if (isalpha(c)) // if it's a letter
{
if (!inWord) // we are not in a word so a new one starts
{
wordStart = p;
inWord = true;
}
} else { // not a letter
if (inWord)
{
uint8_t wordLength = (uint8_t)(p - wordStart);
word_t word = {wordStart, wordLength};
words[wordCount++] = word;
}
inWord = false;
}
}
printf("wordCount: %u\n", wordCount);
printf("ApproxWordCount: %u\n", approxWordCount);
printf("loadFactor: %lf\n\n", ((double)13992 / slotCount));
NextIndexFunc_t NextIndexFuncs[] = {
{linearProbing, "linearProbing"},
{quadraticProbing, "quadraticProbing"},
{combinedProbing, "combinedProbing"},
{linearProbing, "linearProbing2"},
};
for(int fnInx = 0; fnInx < (sizeof(NextIndexFuncs) / sizeof(*NextIndexFuncs)); fnInx++)
{
uniqueWords = 0;
NextIndexFunc_t fn = NextIndexFuncs[fnInx];
wordTable = HashTable_Init(slotCount, fn.FuncPtr);
printf("FuncName: %s\n", fn.FuncName);
for(uint32_t i = 0; i < wordCount; i++) {
uint32_t hashKey = (crc32c(~0, words[i].textP, words[i].length) & 0x00FFFFFF) | words[i].length << 24;
bool existing = GetOrAdd(&wordTable, hashKey, words[i].textP);
if (!existing) {
uniqueWords++;
// printf("%.*s\n", words[i].length, words[i].textP);
}
}
printf("UniqueWords: %u\n\n", uniqueWords);
HashTable_PrintStats(&wordTable);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment