Last active
August 17, 2024 06:25
-
-
Save deadpixi/8350ddb94ece931e9a6f1f86e83dc0ea to your computer and use it in GitHub Desktop.
An insertion-order-preserving hash table in C, as part of the Thebe scripting language (which I will eventually get around to finishing).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <math.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include "thebe.h" | |
/*** TODO | |
* - switch tables to use uint32_t indexes; that's big enough for anybody, maybe even consider using | |
* different sized integers for different sized tables | |
* - grow the entry table more slowly than the index table | |
* - improve the naming of position vs index | |
* - add table sorting, value comparison | |
* - have the *_new functions return ThebeValues? | |
* - needs{growth,compaction} should not be separate from just growing and compacting; | |
* those functions can just exit early if the change isn't needed | |
*/ | |
/*** PRIVATE STRUCTURES AND ENUMERATIONS | |
* All internal structures are predeclared here, so that we don't need to worry about order later on. | |
*/ | |
typedef struct TableEntry TableEntry; | |
struct ThebeString{ | |
size_t length; | |
char *content; | |
uint32_t hashValue; | |
}; | |
struct TableEntry{ | |
ThebeValue key, value; | |
}; | |
struct ThebeTable | |
{ | |
size_t staleEntryCount; | |
size_t entryCount, allocatedCount; | |
size_t *indices; | |
TableEntry *entries; | |
}; | |
/*** FUNCTION PROTOTYPES | |
* Every function likewise gets a prototype, to avoid ordering issues. | |
* FIXME - We should pass ThebeValue by value everywhere. | |
*/ | |
static uint32_t computeHash(const uint8_t* key, size_t length); | |
bool ThebeValue_equals(const ThebeValue *self, const ThebeValue *other); | |
static uint32_t ThebeValue_computeHash(const ThebeValue *self); | |
static bool ThebeValue_isUsableAsKey(const ThebeValue *self); | |
static TableEntry *ThebeTable_getEntryForKey(const ThebeTable *self, const ThebeValue *key); | |
static bool ThebeTable_needsGrowth(const ThebeTable *self); | |
static bool ThebeTable_grow(ThebeTable *self, size_t size); | |
static inline size_t ThebeTable_getIndexOffsetForKey(const ThebeTable *self, const ThebeValue *key); | |
static bool ThebeTable_needsCompaction(const ThebeTable *self); | |
static void ThebeTable_compact(ThebeTable *self); | |
static void ThebeTable_reindex(ThebeTable *self); | |
/*** UTILITY FUNCTIONS */ | |
/* Jenkins one-at-a-time hash. */ | |
static uint32_t | |
computeHash(const uint8_t *key, size_t length) | |
{ | |
uint32_t hash = 0; | |
for (size_t i = 0; i < length; i++){ | |
hash += key[i]; | |
hash += hash << 10; | |
hash ^= hash >> 6; | |
} | |
hash += hash << 3; | |
hash ^= hash >> 11; | |
hash += hash << 15; | |
return hash; | |
} | |
/* Check if a floating point number is a non-infinite number. */ | |
static bool | |
isFiniteFloat(double real) | |
{ | |
switch (fpclassify(real)){ | |
case FP_NORMAL: case FP_SUBNORMAL: case FP_ZERO: | |
return true; | |
default: | |
return false; | |
} | |
} | |
/*** VALUES | |
* Values in the language. | |
*/ | |
bool | |
ThebeValue_equals(const ThebeValue *self, const ThebeValue *other) | |
{ | |
switch (self->type){ | |
case ThebeNilType: | |
return other->type == ThebeNilType; | |
case ThebeBooleanType: | |
return other->type == ThebeBooleanType && self->as.boolean == other->as.boolean; | |
case ThebeIntegerType: | |
if (other->type == ThebeIntegerType) | |
return self->as.integer == other->as.integer; | |
else if (other->type == ThebeRealType) | |
return self->as.integer == other->as.real; | |
else | |
return false; | |
case ThebeRealType: | |
if (other->type == ThebeIntegerType) | |
return self->as.real == other->as.integer; | |
else if (other->type == ThebeRealType) | |
return self->as.real == other->as.real; | |
else | |
return false; | |
case ThebeStringType: | |
return other->type == ThebeStringType | |
&& self->as.string->length == other->as.string->length | |
&& ThebeValue_computeHash(self) == ThebeValue_computeHash(other) | |
&& memcmp(self->as.string->content, other->as.string->content, self->as.string->length) == 0; | |
case ThebeFunctionType: | |
return other->type == ThebeFunctionType && self->as.function == other->as.function; | |
case ThebeTableType: | |
return other->type == ThebeTableType && self->as.table == other->as.table; | |
case ThebeOpaqueType: | |
return other->type == ThebeOpaqueType && self->as.opaque == other->as.opaque; | |
} | |
return false; | |
} | |
static uint32_t | |
ThebeValue_computeHash(const ThebeValue *self) | |
{ | |
switch (self->type){ | |
case ThebeBooleanType: | |
return (uint32_t)self->as.boolean; | |
case ThebeIntegerType: | |
return (uint32_t)self->as.integer; | |
case ThebeRealType: | |
return isFiniteFloat(self->as.real)? (uint32_t)self->as.real : 0; | |
case ThebeStringType: | |
return self->as.string->hashValue; | |
default: | |
return 0; | |
} | |
} | |
static bool | |
ThebeValue_isUsableAsKey(const ThebeValue *self) | |
{ | |
return self->type == ThebeBooleanType | |
|| self->type == ThebeIntegerType | |
|| self->type == ThebeStringType | |
|| (self->type == ThebeRealType && isFiniteFloat(self->as.real)); | |
} | |
/*** STRINGS | |
* Your bog standard counted string, though we do precompute its hash value. | |
* Note that the Thebe interpreter interns string constants in the source code, | |
* so we only ever have one copy of those. | |
*/ | |
ThebeString * | |
ThebeString_newEmptyString(void) | |
{ | |
return calloc(1, sizeof(ThebeString)); | |
} | |
ThebeString * | |
ThebeString_newFromString(const char *string) | |
{ | |
return ThebeString_newFromCountedString(string, strlen(string)); | |
} | |
#define HASHED_STRING_PREFIX 128 | |
ThebeString * | |
ThebeString_newFromCountedString(const char *string, size_t length) | |
{ | |
ThebeString *self = calloc(1, sizeof(ThebeString)); | |
if (!self) | |
return NULL; | |
self->content = calloc(1, length + 1); | |
if (!self->content){ | |
free(self); | |
return NULL; | |
} | |
memcpy(self->content, string, length); | |
size_t prefix = length > HASHED_STRING_PREFIX? HASHED_STRING_PREFIX : length; | |
self->hashValue = computeHash((const uint8_t *)self->content, prefix); | |
return self; | |
} | |
void | |
ThebeString_free(ThebeString *self) | |
{ | |
if (self){ | |
free(self->content); | |
free(self); | |
} | |
} | |
size_t | |
ThebeString_length(const ThebeString *self) | |
{ | |
return self->length; | |
} | |
const char * | |
ThebeString_contents(const ThebeString *self) | |
{ | |
return self->content; | |
} | |
/*** TABLES | |
* A table is a simple data structure that maps a key to a value. | |
* Both keys and values are instances of the ThebeValue structure, which is a tagged union. | |
* | |
* Tables have some interesting properties, the most notable of which is that iteration over the | |
* (key, value) pairs of the table is guaranteed to iterate in insertion order. | |
* | |
* The representation of a table in memory is compact, and inspired by the (beautiful) implementation | |
* of ordered dictionaries in Python: | |
* | |
* Tables consist of two arrays: an index array, and an item array. | |
* For example: | |
* var tab = { | |
* foo = "red", | |
* bar = "green", | |
* baz = "blue", | |
* } | |
* | |
* would be represented as: | |
* | |
* indices = [-1, -1, -1, 2, -1, -1, 0, -1, -1, -1, 1] | |
* content = [ | |
* ["bar", "green"], | |
* ["foo", "red"], | |
* ["baz", "blue"], | |
* ] | |
* | |
* This structure has significant advantages: | |
* - Adding a new entry just means plonking it on the end of the content array. | |
* - Iterating over the contents means just iterating over the content array, | |
* which preserves insertion order. | |
* - Resizing doesn't require moving the members of the content array; they're still packed at the | |
* beginning of the array. | |
* | |
* A (slight) disadvantage: | |
* - Removing an entry just uses tombstoning; compaction is sometimes necessary. | |
* Much like growing a table, this is amortized over all removals. | |
*/ | |
#define DEFAULT_TABLE_SIZE 17 | |
#define NO_ENTRY ((size_t)-1) | |
ThebeTable * | |
ThebeTable_new(void) | |
{ | |
ThebeTable *self = calloc(1, sizeof(ThebeTable)); | |
if (!self) | |
return NULL; | |
if (!ThebeTable_grow(self, DEFAULT_TABLE_SIZE)){ | |
free(self); | |
return NULL; | |
} | |
return self; | |
} | |
void | |
ThebeTable_free(ThebeTable *self) | |
{ | |
if (self){ | |
free(self->entries); | |
free(self->indices); | |
free(self); | |
} | |
} | |
bool | |
ThebeTable_contains(const ThebeTable *self, const ThebeValue *key) | |
{ | |
return ThebeTable_getEntryForKey(self, key) != NULL; | |
} | |
bool | |
ThebeTable_insert(ThebeTable *self, const ThebeValue *key, const ThebeValue *value) | |
{ | |
if (!ThebeValue_isUsableAsKey(key)) | |
return false; | |
/* If this key has a value already, replace it. */ | |
TableEntry *entry = ThebeTable_getEntryForKey(self, key); | |
if (entry){ | |
memcpy(&entry->value, value, sizeof(ThebeValue)); | |
return true; | |
} | |
/* Otherwise, grow the table if need be. */ | |
if (ThebeTable_needsGrowth(self) && !ThebeTable_grow(self, self->allocatedCount * 2)) | |
return false; | |
/* Otherwise, compute the hash, find the index, add the value. */ | |
size_t entryPosition = self->entryCount++; | |
entry = &self->entries[entryPosition]; | |
memcpy(&entry->key, key, sizeof(ThebeValue)); | |
memcpy(&entry->value, value, sizeof(ThebeValue)); | |
size_t position = ThebeTable_getIndexOffsetForKey(self, key); | |
if (self->indices[position] == NO_ENTRY) | |
self->indices[position] = entryPosition; | |
return true; | |
} | |
ThebeResult | |
ThebeTable_get(const ThebeTable *self, const ThebeValue *key) | |
{ | |
TableEntry *entry = ThebeTable_getEntryForKey(self, key); | |
if (!entry) | |
return (ThebeResult){.error = true}; | |
return (ThebeResult){.error = false, .value = entry->value}; | |
} | |
void | |
ThebeTable_remove(ThebeTable *self, const ThebeValue *key) | |
{ | |
if (!ThebeTable_contains(self, key)) | |
return; | |
/* Figure out where this entry lives. */ | |
size_t position = ThebeTable_getIndexOffsetForKey(self, key); | |
size_t index = self->indices[position]; | |
/* Mark the slot as unused. */ | |
memset(&self->entries[index], 0, sizeof(TableEntry)); | |
/* If by happy chance this was at the end of the entries, | |
* removing it doesn't create a stale entry. | |
*/ | |
if (index == self->entryCount - 1) | |
self->entryCount--; | |
else | |
self->staleEntryCount++; | |
/* Compact the table if needed. */ | |
if (ThebeTable_needsCompaction(self)) | |
ThebeTable_compact(self); | |
} | |
void | |
ThebeTable_clear(ThebeTable *self) | |
{ | |
for (size_t i = 0; i < self->allocatedCount; i++) | |
self->indices[i] = NO_ENTRY; | |
self->entryCount = 0; | |
} | |
size_t | |
ThebeTable_size(const ThebeTable *self) | |
{ | |
return self->entryCount; | |
} | |
ThebeTableCookie | |
ThebeTable_startIteration(const ThebeTable *self) | |
{ | |
(void)self; | |
return 0; | |
} | |
ThebeTableCookie | |
ThebeTable_getNextEntry(const ThebeTable *self, ThebeValue *key, ThebeValue *value, ThebeTableCookie cookie) | |
{ | |
while (cookie < self->entryCount && self->entries[cookie].key.type == ThebeNilType) | |
cookie++; | |
if (cookie >= self->entryCount) | |
return ThebeTableIteratorFinished; | |
TableEntry *entry = &self->entries[cookie]; | |
memcpy(key, &entry->key, sizeof(ThebeValue)); | |
memcpy(value, &entry->value, sizeof(ThebeValue)); | |
cookie++; | |
return cookie >= self->entryCount? ThebeTableIteratorFinished : cookie; | |
} | |
static inline size_t | |
ThebeTable_getIndexOffsetForKey(const ThebeTable *self, const ThebeValue *key) | |
{ | |
return ThebeValue_computeHash(key) % self->allocatedCount; | |
} | |
static TableEntry * | |
ThebeTable_getEntryForKey(const ThebeTable *self, const ThebeValue *key) | |
{ | |
if (!ThebeValue_isUsableAsKey(key)) | |
return NULL; | |
size_t index = self->indices[ThebeTable_getIndexOffsetForKey(self, key)]; | |
if (index == NO_ENTRY) | |
return NULL; | |
size_t startIndex = index; | |
do{ | |
TableEntry *entry = &self->entries[index++]; | |
if (ThebeValue_equals(&entry->key, key)) | |
return entry; | |
index %= self->entryCount; | |
} while (index != startIndex); | |
return NULL; | |
} | |
static bool | |
ThebeTable_needsGrowth(const ThebeTable *self) | |
{ | |
return self->entryCount >= (self->allocatedCount - 2) | |
|| ((double)self->entryCount) / ((double)self->allocatedCount) >= 0.8; | |
} | |
static bool | |
ThebeTable_grow(ThebeTable *self, size_t size) | |
{ | |
if (self->allocatedCount > size) | |
return true; | |
if (self->allocatedCount + self->staleEntryCount >= size){ | |
ThebeTable_compact(self); | |
return true; | |
} | |
size_t *newIndices = realloc(self->indices, size * sizeof(size_t)); | |
if (!newIndices) | |
return false; | |
self->indices = newIndices; | |
TableEntry *newEntries = realloc(self->entries, size * sizeof(TableEntry)); | |
if (!newEntries) | |
return false; | |
self->entries = newEntries; | |
self->allocatedCount = size; | |
ThebeTable_reindex(self); | |
return true; | |
} | |
static void | |
ThebeTable_reindex(ThebeTable *self) | |
{ | |
for (size_t i = 0; i < self->allocatedCount; i++) | |
self->indices[i] = NO_ENTRY; | |
for (size_t i = 0; i < self->entryCount; i++){ | |
TableEntry *entry = &self->entries[i]; | |
if (entry->key.type != ThebeNilType) | |
self->indices[ThebeTable_getIndexOffsetForKey(self, &entry->key)] = i; | |
} | |
} | |
static bool | |
ThebeTable_needsCompaction(const ThebeTable *self) | |
{ | |
return ThebeTable_needsGrowth(self) | |
|| ((double)self->staleEntryCount) / ((double)self->entryCount) >= 0.2; | |
} | |
static void | |
ThebeTable_compact(ThebeTable *self) | |
{ | |
if (self->entryCount < 2 || self->staleEntryCount == 0) | |
return; | |
for (size_t i = 0; i < self->entryCount && self->staleEntryCount > 0; i++){ | |
TableEntry *entry = &self->entries[i]; | |
if (entry->key.type == ThebeNilType){ | |
memmove(self->entries + i, self->entries + i + 1, sizeof(TableEntry) * (self->entryCount - i - 1)); | |
self->staleEntryCount--; | |
self->entryCount--; | |
} | |
} | |
ThebeTable_reindex(self); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef THEBE_H | |
#define THEBE_H | |
#include <stdbool.h> | |
#include <stdlib.h> | |
/*** TYPES | |
* The publicly-visible types for the Thebe environment. | |
*/ | |
typedef struct ThebeFunction ThebeFunction; | |
typedef struct ThebeResult ThebeResult; | |
typedef struct ThebeString ThebeString; | |
typedef struct ThebeTable ThebeTable; | |
typedef size_t ThebeTableCookie; | |
typedef struct ThebeValue ThebeValue; | |
typedef enum{ | |
ThebeNilType, | |
ThebeBooleanType, | |
ThebeIntegerType, | |
ThebeRealType, | |
ThebeStringType, | |
ThebeFunctionType, | |
ThebeTableType, | |
ThebeOpaqueType | |
} ThebeType; | |
struct ThebeValue{ | |
ThebeType type; | |
union{ | |
bool boolean; | |
int integer; | |
double real; | |
ThebeString *string; | |
ThebeFunction *function; | |
ThebeTable *table; | |
void *opaque; | |
} as; | |
}; | |
struct ThebeResult{ | |
bool error; | |
ThebeValue value; | |
}; | |
/*** VALUES */ | |
bool | |
ThebeValue_equals(const ThebeValue *self, const ThebeValue *other); | |
/*** STRINGS */ | |
ThebeString * | |
ThebeString_newEmptyString(void); | |
ThebeString * | |
ThebeString_newFromString(const char *string); | |
ThebeString * | |
ThebeString_newFromCountedString(const char *string, size_t length); | |
size_t | |
ThebeString_length(const ThebeString *self); | |
const char * | |
ThebeString_content(const ThebeString *self); | |
void | |
ThebeString_free(ThebeString *self); | |
/*** TABLES */ | |
ThebeTable * | |
ThebeTable_new(void); | |
void | |
ThebeTable_free(ThebeTable *self); | |
bool | |
ThebeTable_insert(ThebeTable *self, const ThebeValue *key, const ThebeValue *value); | |
bool | |
ThebeTable_contains(const ThebeTable *self, const ThebeValue *key); | |
ThebeResult | |
ThebeTable_get(const ThebeTable *self, const ThebeValue *key); | |
void | |
ThebeTable_remove(ThebeTable *self, const ThebeValue *key); | |
void | |
ThebeTable_clear(ThebeTable *self); | |
size_t | |
ThebeTable_size(const ThebeTable *self); | |
#define ThebeTableIteratorFinished ((ThebeTableCookie)-1) | |
ThebeTableCookie | |
ThebeTable_startIteration(const ThebeTable *self); | |
ThebeTableCookie | |
ThebeTable_getNextEntry(const ThebeTable *self, ThebeValue *key, ThebeValue *value, ThebeTableCookie cookie); | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment