Skip to content

Instantly share code, notes, and snippets.

@deadpixi
Last active August 17, 2024 06:25
Show Gist options
  • Save deadpixi/8350ddb94ece931e9a6f1f86e83dc0ea to your computer and use it in GitHub Desktop.
Save deadpixi/8350ddb94ece931e9a6f1f86e83dc0ea to your computer and use it in GitHub Desktop.
An insertion-order-preserving hash table in C, as part of the Thebe scripting language (which I will eventually get around to finishing).
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include "thebe.h"
/*** TODO
* - switch tables to use uint32_t indexes; that's big enough for anybody, maybe even consider using
* different sized integers for different sized tables
* - grow the entry table more slowly than the index table
* - improve the naming of position vs index
* - add table sorting, value comparison
* - have the *_new functions return ThebeValues?
* - needs{growth,compaction} should not be separate from just growing and compacting;
* those functions can just exit early if the change isn't needed
*/
/*** PRIVATE STRUCTURES AND ENUMERATIONS
* All internal structures are predeclared here, so that we don't need to worry about order later on.
*/
typedef struct TableEntry TableEntry;
struct ThebeString{
size_t length;
char *content;
uint32_t hashValue;
};
struct TableEntry{
ThebeValue key, value;
};
struct ThebeTable
{
size_t staleEntryCount;
size_t entryCount, allocatedCount;
size_t *indices;
TableEntry *entries;
};
/*** FUNCTION PROTOTYPES
* Every function likewise gets a prototype, to avoid ordering issues.
* FIXME - We should pass ThebeValue by value everywhere.
*/
static uint32_t computeHash(const uint8_t* key, size_t length);
bool ThebeValue_equals(const ThebeValue *self, const ThebeValue *other);
static uint32_t ThebeValue_computeHash(const ThebeValue *self);
static bool ThebeValue_isUsableAsKey(const ThebeValue *self);
static TableEntry *ThebeTable_getEntryForKey(const ThebeTable *self, const ThebeValue *key);
static bool ThebeTable_needsGrowth(const ThebeTable *self);
static bool ThebeTable_grow(ThebeTable *self, size_t size);
static inline size_t ThebeTable_getIndexOffsetForKey(const ThebeTable *self, const ThebeValue *key);
static bool ThebeTable_needsCompaction(const ThebeTable *self);
static void ThebeTable_compact(ThebeTable *self);
static void ThebeTable_reindex(ThebeTable *self);
/*** UTILITY FUNCTIONS */
/* Jenkins one-at-a-time hash. */
static uint32_t
computeHash(const uint8_t *key, size_t length)
{
uint32_t hash = 0;
for (size_t i = 0; i < length; i++){
hash += key[i];
hash += hash << 10;
hash ^= hash >> 6;
}
hash += hash << 3;
hash ^= hash >> 11;
hash += hash << 15;
return hash;
}
/* Check if a floating point number is a non-infinite number. */
static bool
isFiniteFloat(double real)
{
switch (fpclassify(real)){
case FP_NORMAL: case FP_SUBNORMAL: case FP_ZERO:
return true;
default:
return false;
}
}
/*** VALUES
* Values in the language.
*/
bool
ThebeValue_equals(const ThebeValue *self, const ThebeValue *other)
{
switch (self->type){
case ThebeNilType:
return other->type == ThebeNilType;
case ThebeBooleanType:
return other->type == ThebeBooleanType && self->as.boolean == other->as.boolean;
case ThebeIntegerType:
if (other->type == ThebeIntegerType)
return self->as.integer == other->as.integer;
else if (other->type == ThebeRealType)
return self->as.integer == other->as.real;
else
return false;
case ThebeRealType:
if (other->type == ThebeIntegerType)
return self->as.real == other->as.integer;
else if (other->type == ThebeRealType)
return self->as.real == other->as.real;
else
return false;
case ThebeStringType:
return other->type == ThebeStringType
&& self->as.string->length == other->as.string->length
&& ThebeValue_computeHash(self) == ThebeValue_computeHash(other)
&& memcmp(self->as.string->content, other->as.string->content, self->as.string->length) == 0;
case ThebeFunctionType:
return other->type == ThebeFunctionType && self->as.function == other->as.function;
case ThebeTableType:
return other->type == ThebeTableType && self->as.table == other->as.table;
case ThebeOpaqueType:
return other->type == ThebeOpaqueType && self->as.opaque == other->as.opaque;
}
return false;
}
static uint32_t
ThebeValue_computeHash(const ThebeValue *self)
{
switch (self->type){
case ThebeBooleanType:
return (uint32_t)self->as.boolean;
case ThebeIntegerType:
return (uint32_t)self->as.integer;
case ThebeRealType:
return isFiniteFloat(self->as.real)? (uint32_t)self->as.real : 0;
case ThebeStringType:
return self->as.string->hashValue;
default:
return 0;
}
}
static bool
ThebeValue_isUsableAsKey(const ThebeValue *self)
{
return self->type == ThebeBooleanType
|| self->type == ThebeIntegerType
|| self->type == ThebeStringType
|| (self->type == ThebeRealType && isFiniteFloat(self->as.real));
}
/*** STRINGS
* Your bog standard counted string, though we do precompute its hash value.
* Note that the Thebe interpreter interns string constants in the source code,
* so we only ever have one copy of those.
*/
ThebeString *
ThebeString_newEmptyString(void)
{
return calloc(1, sizeof(ThebeString));
}
ThebeString *
ThebeString_newFromString(const char *string)
{
return ThebeString_newFromCountedString(string, strlen(string));
}
#define HASHED_STRING_PREFIX 128
ThebeString *
ThebeString_newFromCountedString(const char *string, size_t length)
{
ThebeString *self = calloc(1, sizeof(ThebeString));
if (!self)
return NULL;
self->content = calloc(1, length + 1);
if (!self->content){
free(self);
return NULL;
}
memcpy(self->content, string, length);
size_t prefix = length > HASHED_STRING_PREFIX? HASHED_STRING_PREFIX : length;
self->hashValue = computeHash((const uint8_t *)self->content, prefix);
return self;
}
void
ThebeString_free(ThebeString *self)
{
if (self){
free(self->content);
free(self);
}
}
size_t
ThebeString_length(const ThebeString *self)
{
return self->length;
}
const char *
ThebeString_contents(const ThebeString *self)
{
return self->content;
}
/*** TABLES
* A table is a simple data structure that maps a key to a value.
* Both keys and values are instances of the ThebeValue structure, which is a tagged union.
*
* Tables have some interesting properties, the most notable of which is that iteration over the
* (key, value) pairs of the table is guaranteed to iterate in insertion order.
*
* The representation of a table in memory is compact, and inspired by the (beautiful) implementation
* of ordered dictionaries in Python:
*
* Tables consist of two arrays: an index array, and an item array.
* For example:
* var tab = {
* foo = "red",
* bar = "green",
* baz = "blue",
* }
*
* would be represented as:
*
* indices = [-1, -1, -1, 2, -1, -1, 0, -1, -1, -1, 1]
* content = [
* ["bar", "green"],
* ["foo", "red"],
* ["baz", "blue"],
* ]
*
* This structure has significant advantages:
* - Adding a new entry just means plonking it on the end of the content array.
* - Iterating over the contents means just iterating over the content array,
* which preserves insertion order.
* - Resizing doesn't require moving the members of the content array; they're still packed at the
* beginning of the array.
*
* A (slight) disadvantage:
* - Removing an entry just uses tombstoning; compaction is sometimes necessary.
* Much like growing a table, this is amortized over all removals.
*/
#define DEFAULT_TABLE_SIZE 17
#define NO_ENTRY ((size_t)-1)
ThebeTable *
ThebeTable_new(void)
{
ThebeTable *self = calloc(1, sizeof(ThebeTable));
if (!self)
return NULL;
if (!ThebeTable_grow(self, DEFAULT_TABLE_SIZE)){
free(self);
return NULL;
}
return self;
}
void
ThebeTable_free(ThebeTable *self)
{
if (self){
free(self->entries);
free(self->indices);
free(self);
}
}
bool
ThebeTable_contains(const ThebeTable *self, const ThebeValue *key)
{
return ThebeTable_getEntryForKey(self, key) != NULL;
}
bool
ThebeTable_insert(ThebeTable *self, const ThebeValue *key, const ThebeValue *value)
{
if (!ThebeValue_isUsableAsKey(key))
return false;
/* If this key has a value already, replace it. */
TableEntry *entry = ThebeTable_getEntryForKey(self, key);
if (entry){
memcpy(&entry->value, value, sizeof(ThebeValue));
return true;
}
/* Otherwise, grow the table if need be. */
if (ThebeTable_needsGrowth(self) && !ThebeTable_grow(self, self->allocatedCount * 2))
return false;
/* Otherwise, compute the hash, find the index, add the value. */
size_t entryPosition = self->entryCount++;
entry = &self->entries[entryPosition];
memcpy(&entry->key, key, sizeof(ThebeValue));
memcpy(&entry->value, value, sizeof(ThebeValue));
size_t position = ThebeTable_getIndexOffsetForKey(self, key);
if (self->indices[position] == NO_ENTRY)
self->indices[position] = entryPosition;
return true;
}
ThebeResult
ThebeTable_get(const ThebeTable *self, const ThebeValue *key)
{
TableEntry *entry = ThebeTable_getEntryForKey(self, key);
if (!entry)
return (ThebeResult){.error = true};
return (ThebeResult){.error = false, .value = entry->value};
}
void
ThebeTable_remove(ThebeTable *self, const ThebeValue *key)
{
if (!ThebeTable_contains(self, key))
return;
/* Figure out where this entry lives. */
size_t position = ThebeTable_getIndexOffsetForKey(self, key);
size_t index = self->indices[position];
/* Mark the slot as unused. */
memset(&self->entries[index], 0, sizeof(TableEntry));
/* If by happy chance this was at the end of the entries,
* removing it doesn't create a stale entry.
*/
if (index == self->entryCount - 1)
self->entryCount--;
else
self->staleEntryCount++;
/* Compact the table if needed. */
if (ThebeTable_needsCompaction(self))
ThebeTable_compact(self);
}
void
ThebeTable_clear(ThebeTable *self)
{
for (size_t i = 0; i < self->allocatedCount; i++)
self->indices[i] = NO_ENTRY;
self->entryCount = 0;
}
size_t
ThebeTable_size(const ThebeTable *self)
{
return self->entryCount;
}
ThebeTableCookie
ThebeTable_startIteration(const ThebeTable *self)
{
(void)self;
return 0;
}
ThebeTableCookie
ThebeTable_getNextEntry(const ThebeTable *self, ThebeValue *key, ThebeValue *value, ThebeTableCookie cookie)
{
while (cookie < self->entryCount && self->entries[cookie].key.type == ThebeNilType)
cookie++;
if (cookie >= self->entryCount)
return ThebeTableIteratorFinished;
TableEntry *entry = &self->entries[cookie];
memcpy(key, &entry->key, sizeof(ThebeValue));
memcpy(value, &entry->value, sizeof(ThebeValue));
cookie++;
return cookie >= self->entryCount? ThebeTableIteratorFinished : cookie;
}
static inline size_t
ThebeTable_getIndexOffsetForKey(const ThebeTable *self, const ThebeValue *key)
{
return ThebeValue_computeHash(key) % self->allocatedCount;
}
static TableEntry *
ThebeTable_getEntryForKey(const ThebeTable *self, const ThebeValue *key)
{
if (!ThebeValue_isUsableAsKey(key))
return NULL;
size_t index = self->indices[ThebeTable_getIndexOffsetForKey(self, key)];
if (index == NO_ENTRY)
return NULL;
size_t startIndex = index;
do{
TableEntry *entry = &self->entries[index++];
if (ThebeValue_equals(&entry->key, key))
return entry;
index %= self->entryCount;
} while (index != startIndex);
return NULL;
}
static bool
ThebeTable_needsGrowth(const ThebeTable *self)
{
return self->entryCount >= (self->allocatedCount - 2)
|| ((double)self->entryCount) / ((double)self->allocatedCount) >= 0.8;
}
static bool
ThebeTable_grow(ThebeTable *self, size_t size)
{
if (self->allocatedCount > size)
return true;
if (self->allocatedCount + self->staleEntryCount >= size){
ThebeTable_compact(self);
return true;
}
size_t *newIndices = realloc(self->indices, size * sizeof(size_t));
if (!newIndices)
return false;
self->indices = newIndices;
TableEntry *newEntries = realloc(self->entries, size * sizeof(TableEntry));
if (!newEntries)
return false;
self->entries = newEntries;
self->allocatedCount = size;
ThebeTable_reindex(self);
return true;
}
static void
ThebeTable_reindex(ThebeTable *self)
{
for (size_t i = 0; i < self->allocatedCount; i++)
self->indices[i] = NO_ENTRY;
for (size_t i = 0; i < self->entryCount; i++){
TableEntry *entry = &self->entries[i];
if (entry->key.type != ThebeNilType)
self->indices[ThebeTable_getIndexOffsetForKey(self, &entry->key)] = i;
}
}
static bool
ThebeTable_needsCompaction(const ThebeTable *self)
{
return ThebeTable_needsGrowth(self)
|| ((double)self->staleEntryCount) / ((double)self->entryCount) >= 0.2;
}
static void
ThebeTable_compact(ThebeTable *self)
{
if (self->entryCount < 2 || self->staleEntryCount == 0)
return;
for (size_t i = 0; i < self->entryCount && self->staleEntryCount > 0; i++){
TableEntry *entry = &self->entries[i];
if (entry->key.type == ThebeNilType){
memmove(self->entries + i, self->entries + i + 1, sizeof(TableEntry) * (self->entryCount - i - 1));
self->staleEntryCount--;
self->entryCount--;
}
}
ThebeTable_reindex(self);
}
#ifndef THEBE_H
#define THEBE_H
#include <stdbool.h>
#include <stdlib.h>
/*** TYPES
* The publicly-visible types for the Thebe environment.
*/
typedef struct ThebeFunction ThebeFunction;
typedef struct ThebeResult ThebeResult;
typedef struct ThebeString ThebeString;
typedef struct ThebeTable ThebeTable;
typedef size_t ThebeTableCookie;
typedef struct ThebeValue ThebeValue;
typedef enum{
ThebeNilType,
ThebeBooleanType,
ThebeIntegerType,
ThebeRealType,
ThebeStringType,
ThebeFunctionType,
ThebeTableType,
ThebeOpaqueType
} ThebeType;
struct ThebeValue{
ThebeType type;
union{
bool boolean;
int integer;
double real;
ThebeString *string;
ThebeFunction *function;
ThebeTable *table;
void *opaque;
} as;
};
struct ThebeResult{
bool error;
ThebeValue value;
};
/*** VALUES */
bool
ThebeValue_equals(const ThebeValue *self, const ThebeValue *other);
/*** STRINGS */
ThebeString *
ThebeString_newEmptyString(void);
ThebeString *
ThebeString_newFromString(const char *string);
ThebeString *
ThebeString_newFromCountedString(const char *string, size_t length);
size_t
ThebeString_length(const ThebeString *self);
const char *
ThebeString_content(const ThebeString *self);
void
ThebeString_free(ThebeString *self);
/*** TABLES */
ThebeTable *
ThebeTable_new(void);
void
ThebeTable_free(ThebeTable *self);
bool
ThebeTable_insert(ThebeTable *self, const ThebeValue *key, const ThebeValue *value);
bool
ThebeTable_contains(const ThebeTable *self, const ThebeValue *key);
ThebeResult
ThebeTable_get(const ThebeTable *self, const ThebeValue *key);
void
ThebeTable_remove(ThebeTable *self, const ThebeValue *key);
void
ThebeTable_clear(ThebeTable *self);
size_t
ThebeTable_size(const ThebeTable *self);
#define ThebeTableIteratorFinished ((ThebeTableCookie)-1)
ThebeTableCookie
ThebeTable_startIteration(const ThebeTable *self);
ThebeTableCookie
ThebeTable_getNextEntry(const ThebeTable *self, ThebeValue *key, ThebeValue *value, ThebeTableCookie cookie);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment