Created
August 28, 2024 22:41
-
-
Save UplinkCoder/84a38f9843b3315281da044c2c7b1c9c to your computer and use it in GitHub Desktop.
test_probing_with_the_bible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <string#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include <ctype.h> | |
#include <time.h> | |
#include "crc32c.c" | |
#define TABLE_SIZE ((uint32_t)(4096 * 16)) // Size of the hash table | |
typedef struct { | |
uint32_t hashKey; | |
const char* value; | |
bool occupied; | |
uint32_t chainLength; | |
} HashTableSlot; | |
typedef struct { | |
uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount); | |
uint32_t slotCount; | |
uint32_t totalChainLength; | |
uint32_t totalDisplacement; | |
uint32_t entryCount; | |
uint32_t hits; | |
uint32_t misses; | |
HashTableSlot* slots; | |
} HashTable; | |
HashTable HashTable_Init(uint32_t slotCount, uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount)) { | |
HashTable table; | |
memset(&table, 0, sizeof(table)); | |
table.GetSlotIndex = GetSlotIndex; | |
table.slots = (HashTableSlot*)calloc(slotCount, sizeof(*table.slots)); | |
table.slotCount = slotCount; | |
return table; | |
} | |
// Simple primary hash functison | |
uint32_t primaryHash(uint32_t key) { | |
return key; | |
} | |
// Linear probing strategy | |
uint32_t linearProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + i); | |
} | |
// Linear probing strategy 2 | |
uint32_t linearProbing2(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i * ((hashKey & 3)|1) )); | |
} | |
// Quadratic probing strategy | |
uint32_t quadraticProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i*i)); | |
} | |
// Combined probing strategy | |
uint32_t combinedProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i < 16 ? i : i*i)); | |
} | |
// Quadratic probing strategy | |
uint32_t quadraticProbing2(uint32_t hashKey, uint32_t i) { | |
return quadraticProbing(hashKey, i) + ((hashKey & 31) | 1); | |
} | |
/// returns true if the value has already been added. | |
/// false otherwise | |
bool GetOrAdd(HashTable* table, uint32_t key, const char* value) { | |
uint32_t hashKey = key; | |
uint32_t length = (key & 0xFF000000) >> 24; | |
uint32_t probeLength = 0; | |
uint32_t slotCount = table->slotCount; | |
int32_t idealSlotIndex = (int32_t) (hashKey % slotCount); | |
for (uint32_t i = 0;;i++) { | |
uint32_t slotIndex = (table->GetSlotIndex(hashKey, i) % slotCount); | |
HashTableSlot* slot = &table->slots[slotIndex]; | |
if (i > TABLE_SIZE * 3) | |
{ | |
// printf("Probe Limit reached aborting\n"); | |
// abort(); | |
int k = 2; | |
} | |
if (!slot->occupied) { | |
// Found an empty slot | |
slot->hashKey = hashKey; | |
slot->value = value; | |
slot->occupied = true; | |
slot->chainLength = i; | |
table->totalChainLength += i; | |
table->totalDisplacement += abs(((int32_t) slotIndex) - idealSlotIndex); | |
table->entryCount += 1; | |
if (i == 0) table->hits += 1; | |
// printf("%d\n", key); | |
return false; | |
} else { | |
if (slot->hashKey == key) | |
{ | |
if (0 == memcmp(slot->value, value, length)) | |
{ | |
return true; | |
} | |
else | |
{ | |
printf("!!! Oh a real hash collision !!!!\n"); | |
} | |
} | |
} | |
} | |
// Table is full, insertion failed | |
return false; | |
} | |
static time_t initTs; | |
void HashTable_PrintStats(HashTable *self) | |
{ | |
HashTable table = *self; | |
printf("Average Hits: %g %%\n", (double) table.hits / table.entryCount); | |
printf("Total Collisions: %u \n", table.totalChainLength); | |
printf("Average Chain Length: %lf\n", (double)table.totalChainLength / table.entryCount); | |
printf("Average Displacement: %lf\n\n", (double)table.totalDisplacement / table.entryCount); | |
} | |
typedef struct { | |
const char* textP; | |
uint8_t length; | |
} word_t; | |
typedef struct { | |
uint32_t (*FuncPtr)(uint32_t hash, uint32_t probeCount); | |
const char* FuncName; | |
} NextIndexFunc_t; | |
int main() { | |
double loadFactors[] = { 0.5, 0.75, 0.80, 0.95, 0.9999 }; | |
const uint32_t slotCount = /*14197;*/ 14011; | |
FILE* fbible = fopen("bible.txt", "rb"); | |
uint32_t approxWordCount = 0; | |
word_t* words = 0; | |
uint32_t wordCount = 0; | |
uint32_t wordCapacity = 0; | |
uint32_t uniqueWords = 0; | |
uint32_t size = 0; | |
char* wordStart = 0; | |
char * bible = 0; | |
bool inWord = false; | |
HashTable wordTable; | |
fseek(fbible, 0, SEEK_END); | |
size = ftell(fbible); | |
fseek(fbible, 0, SEEK_SET); | |
approxWordCount = 800000; | |
words = malloc(sizeof(word_t) * approxWordCount); | |
wordCapacity = approxWordCount; | |
// initTs = time(NULL); | |
bible = malloc(size + 1); | |
fread(bible, 1, size, fbible); | |
bible[size] = '\0'; | |
for(char c, *p = bible; c = *p; ++p) | |
{ | |
if (isalpha(c)) // if it's a letter | |
{ | |
if (!inWord) // we are not in a word so a new one starts | |
{ | |
wordStart = p; | |
inWord = true; | |
} | |
} else { // not a letter | |
if (inWord) | |
{ | |
uint8_t wordLength = (uint8_t)(p - wordStart); | |
word_t word = {wordStart, wordLength}; | |
words[wordCount++] = word; | |
} | |
inWord = false; | |
} | |
} | |
printf("wordCount: %u\n", wordCount); | |
printf("ApproxWordCount: %u\n", approxWordCount); | |
printf("loadFactor: %lf\n\n", ((double)13992 / slotCount)); | |
NextIndexFunc_t NextIndexFuncs[] = { | |
{linearProbing, "linearProbing"}, | |
{quadraticProbing, "quadraticProbing"}, | |
{combinedProbing, "combinedProbing"}, | |
{linearProbing2, "linearProbing2"}, | |
}; | |
for(int fnInx = 0; fnInx < (sizeof(NextIndexFuncs) / sizeof(*NextIndexFuncs)); fnInx++) | |
{ | |
uniqueWords = 0; | |
NextIndexFunc_t fn = NextIndexFuncs[fnInx]; | |
wordTable = HashTable_Init(slotCount, fn.FuncPtr); | |
printf("FuncName: %s\n", fn.FuncName); | |
for(uint32_t i = 0; i < wordCount; i++) { | |
uint32_t hashKey = (crc32c(~0, words[i].textP, words[i].length) & 0x00FFFFFF) | words[i].length << 24; | |
bool existing = GetOrAdd(&wordTable, hashKey, words[i].textP); | |
if (!existing) { | |
uniqueWords++; | |
// printf("%.*s\n", words[i].length, words[i].textP); | |
} | |
} | |
printf("UniqueWords: %u\n\n", uniqueWords); | |
HashTable_PrintStats(&wordTable); | |
} | |
return 0; | |
} | |
#include <time.h> | |
#include "crc32c.c" | |
#define TABLE_SIZE ((uint32_t)(4096 * 16)) // Size of the hash table | |
typedef struct { | |
uint32_t hashKey; | |
const char* value; | |
bool occupied; | |
uint32_t chainLength; | |
} HashTableSlot; | |
typedef struct { | |
uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount); | |
uint32_t slotCount; | |
uint32_t totalChainLength; | |
uint32_t totalDisplacement; | |
uint32_t entryCount; | |
uint32_t hits; | |
uint32_t misses; | |
HashTableSlot* slots; | |
} HashTable; | |
HashTable HashTable_Init(uint32_t slotCount, uint32_t (*GetSlotIndex)(uint32_t hash, uint32_t probeCount)) { | |
HashTable table; | |
memset(&table, 0, sizeof(table)); | |
table.GetSlotIndex = GetSlotIndex; | |
table.slots = (HashTableSlot*)calloc(slotCount, sizeof(*table.slots)); | |
table.slotCount = slotCount; | |
return table; | |
} | |
// Simple primary hash functison | |
uint32_t primaryHash(uint32_t key) { | |
return key; | |
} | |
// Linear probing strategy | |
uint32_t linearProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + i); | |
} | |
// Linear probing strategy 2 | |
uint32_t linearProbing2(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i * hashKey & 7)); | |
} | |
// Quadratic probing strategy | |
uint32_t quadraticProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i*i)); | |
} | |
// Combined probing strategy | |
uint32_t combinedProbing(uint32_t hashKey, uint32_t i) { | |
return (hashKey + (i < 16 ? i : i*i)); | |
} | |
// Quadratic probing strategy | |
uint32_t quadraticProbing2(uint32_t hashKey, uint32_t i) { | |
return quadraticProbing(hashKey, i) + ((hashKey & 31) | 1); | |
} | |
/// returns true if the value has already been added. | |
/// false otherwise | |
bool GetOrAdd(HashTable* table, uint32_t key, const char* value) { | |
uint32_t hashKey = key; | |
uint32_t length = (key & 0xFF000000) >> 24; | |
uint32_t probeLength = 0; | |
uint32_t slotCount = table->slotCount; | |
int32_t idealSlotIndex = (int32_t) (hashKey % slotCount); | |
for (uint32_t i = 0;;i++) { | |
uint32_t slotIndex = (table->GetSlotIndex(hashKey, i) % slotCount); | |
HashTableSlot* slot = &table->slots[slotIndex]; | |
if (i > TABLE_SIZE * 3) | |
{ | |
// printf("Probe Limit reached aborting\n"); | |
// abort(); | |
int k = 2; | |
} | |
if (!slot->occupied) { | |
// Found an empty slot | |
slot->hashKey = hashKey; | |
slot->value = value; | |
slot->occupied = true; | |
slot->chainLength = i; | |
table->totalChainLength += i; | |
table->totalDisplacement += abs(((int32_t) slotIndex) - idealSlotIndex); | |
table->entryCount += 1; | |
if (i == 0) table->hits += 1; | |
// printf("%d\n", key); | |
return false; | |
} else { | |
if (slot->hashKey == key) | |
{ | |
if (0 == memcmp(slot->value, value, length)) | |
{ | |
return true; | |
} | |
else | |
{ | |
printf("!!! Oh a real hash collision !!!!\n"); | |
} | |
} | |
} | |
} | |
// Table is full, insertion failed | |
return false; | |
} | |
static time_t initTs; | |
void HashTable_PrintStats(HashTable *self) | |
{ | |
HashTable table = *self; | |
printf("Average Hits: %g %%\n", (double) table.hits / table.entryCount); | |
printf("Total Collisions: %u \n", table.totalChainLength); | |
printf("Average Chain Length: %lf\n", (double)table.totalChainLength / table.entryCount); | |
printf("Average Displacement: %lf\n\n", (double)table.totalDisplacement / table.entryCount); | |
} | |
typedef struct { | |
const char* textP; | |
uint8_t length; | |
} word_t; | |
typedef struct { | |
uint32_t (*FuncPtr)(uint32_t hash, uint32_t probeCount); | |
const char* FuncName; | |
} NextIndexFunc_t; | |
int main() { | |
double loadFactors[] = { 0.5, 0.75, 0.80, 0.95, 0.9999 }; | |
const uint32_t slotCount = 14197; // 14011; | |
FILE* fbible = fopen("bible.txt", "rb"); | |
uint32_t approxWordCount = 0; | |
word_t* words = 0; | |
uint32_t wordCount = 0; | |
uint32_t wordCapacity = 0; | |
uint32_t uniqueWords = 0; | |
uint32_t size = 0; | |
char* wordStart = 0; | |
char * bible = 0; | |
bool inWord = false; | |
HashTable wordTable; | |
fseek(fbible, 0, SEEK_END); | |
size = ftell(fbible); | |
fseek(fbible, 0, SEEK_SET); | |
approxWordCount = 800000; | |
words = malloc(sizeof(word_t) * approxWordCount); | |
wordCapacity = approxWordCount; | |
// initTs = time(NULL); | |
bible = malloc(size + 1); | |
fread(bible, 1, size, fbible); | |
bible[size] = '\0'; | |
for(char c, *p = bible; c = *p; ++p) | |
{ | |
if (isalpha(c)) // if it's a letter | |
{ | |
if (!inWord) // we are not in a word so a new one starts | |
{ | |
wordStart = p; | |
inWord = true; | |
} | |
} else { // not a letter | |
if (inWord) | |
{ | |
uint8_t wordLength = (uint8_t)(p - wordStart); | |
word_t word = {wordStart, wordLength}; | |
words[wordCount++] = word; | |
} | |
inWord = false; | |
} | |
} | |
printf("wordCount: %u\n", wordCount); | |
printf("ApproxWordCount: %u\n", approxWordCount); | |
printf("loadFactor: %lf\n\n", ((double)13992 / slotCount)); | |
NextIndexFunc_t NextIndexFuncs[] = { | |
{linearProbing, "linearProbing"}, | |
{quadraticProbing, "quadraticProbing"}, | |
{combinedProbing, "combinedProbing"}, | |
{linearProbing, "linearProbing2"}, | |
}; | |
for(int fnInx = 0; fnInx < (sizeof(NextIndexFuncs) / sizeof(*NextIndexFuncs)); fnInx++) | |
{ | |
uniqueWords = 0; | |
NextIndexFunc_t fn = NextIndexFuncs[fnInx]; | |
wordTable = HashTable_Init(slotCount, fn.FuncPtr); | |
printf("FuncName: %s\n", fn.FuncName); | |
for(uint32_t i = 0; i < wordCount; i++) { | |
uint32_t hashKey = (crc32c(~0, words[i].textP, words[i].length) & 0x00FFFFFF) | words[i].length << 24; | |
bool existing = GetOrAdd(&wordTable, hashKey, words[i].textP); | |
if (!existing) { | |
uniqueWords++; | |
// printf("%.*s\n", words[i].length, words[i].textP); | |
} | |
} | |
printf("UniqueWords: %u\n\n", uniqueWords); | |
HashTable_PrintStats(&wordTable); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment