Skip to content

Instantly share code, notes, and snippets.

@wernsey
Created September 13, 2020 10:49
Show Gist options
  • Save wernsey/831cc801bb859bb649e194e4fb386561 to your computer and use it in GitHub Desktop.
Save wernsey/831cc801bb859bb649e194e4fb386561 to your computer and use it in GitHub Desktop.
String interning snippet
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
typedef struct internTreeNode ITNode;
/* =============================================================
String Interning
`str_intern()` returns a reference counted copy of its parameter. It looks up
the string in the collection of strings it has seen before. If it hasn't seen
the string before, it creates a copy with a reference count initialised to 1
and adds it to the collection, otherwise it increments the reference count of
the string in the collection and returns it.
The collection the strings are kept in is a Red-Black tree. The implementation
is just a straight forward adaption of the one in the [Wikipedia article][red-black].
The first parameter to `str_intern()` is a pointer to an `ITNode*` that forms
the root of this tree.
The reference counting works by storing an `unsigned int` in the bytes before
the `char*` returned by `str_intern()`. The `char*` can therefore be used like
a regular null-terminated C string.
Call `str_release()` to decrement the reference count. If the reference count
drops to zero, the memory is freed.
[red-black]: https://en.wikipedia.org/wiki/Red%E2%80%93black_tree#Insertion
============================================================= */
#define BLACK 0
#define RED 1
struct internTreeNode {
char *value;
int color;
struct internTreeNode *parent, *left, *right;
};
static char *str_make(const char *str) {
size_t len = strlen(str);
unsigned int *rc = malloc((sizeof *rc) + len + 1);
char *data = (char*)rc + sizeof *rc;
*rc = 1;
memcpy(data, str, len);
data[len] = '\0';
return data;
}
static ITNode *make_intern_node(ITNode *parent, const char *str) {
ITNode *node = malloc(sizeof *node);
if(!node)
return NULL;
node->value = str_make(str);
node->parent = parent;
node->color = RED;
node->left = NULL;
node->right = NULL;
return node;
}
static void str_release(char *str) {
unsigned int *rc = (unsigned int *)(str - sizeof *rc);
if(--(*rc) == 0)
free(rc);
}
static char *str_retain(char *str) {
unsigned int *rc = (unsigned int *)(str - sizeof *rc);
(*rc)++;
return str;
}
static ITNode *do_str_intern(ITNode *root, const char *str) {
int i = strcmp(root->value, str);
if(i == 0) {
str_retain(root->value);
return root;
} else if(i > 0) {
if(root->left) {
return do_str_intern(root->left, str);
} else {
root->left = make_intern_node(root, str);
return root->left;
}
} else { // i < 0
if(root->right) {
return do_str_intern(root->right, str);
} else {
root->right = make_intern_node(root, str);
return root->right;
}
}
}
static void repair_tree(ITNode *n);
static char *str_intern(ITNode **pool, const char *str) {
ITNode *n;
if(!*pool) {
n = make_intern_node(NULL, str);
} else {
n = do_str_intern(*pool, str);
}
repair_tree(n);
ITNode *root = n;
while(root->parent)
root = root->parent;
*pool = root;
return n->value;
}
static void free_intern_tree(ITNode *node) {
if(!node)
return;
free_intern_tree(node->left);
free_intern_tree(node->right);
// Note that node->value is not freed on purpose...
free(node);
}
/* Red-Black Tree */
static ITNode *uncle(ITNode *n) {
ITNode *p = n->parent;
if(p && p->parent)
return p == p->parent->left ? p->parent->right : p->parent->left;
return NULL;
}
static void rotate_left(ITNode *n) {
assert(n);
ITNode *nnew = n->right;
ITNode *p = n->parent;
assert(nnew);
n->right = nnew->left;
nnew->left = n;
n->parent = nnew;
if(n->right)
n->right->parent = n;
if(p) {
if(n == p->left)
p->left = nnew;
else
p->right = nnew;
}
nnew->parent = p;
}
static void rotate_right(ITNode *n) {
assert(n);
ITNode *nnew = n->left;
ITNode *p = n->parent;
assert(nnew);
n->left = nnew->right;
nnew->right = n;
n->parent = nnew;
if(n->left)
n->left->parent = n;
if(p) {
if(n == p->left)
p->left = nnew;
else
p->right = nnew;
}
nnew->parent = p;
}
static void repair_tree(ITNode *n) {
ITNode *p = n->parent;
if(!p) {
// case 1
n->color = BLACK;
} else if(p->color == BLACK) {
// case 2
return;
} else {
ITNode *u = uncle(n);
ITNode *g = p->parent;
assert(g);
if(u && u->color == RED) {
// case 3
p->color = BLACK;
u->color = BLACK;
g->color = RED;
repair_tree(g);
} else {
// case 4
if(n == p->right && p == g->left) {
rotate_left(p);
n = n->left;
p = n->parent;
g = p->parent;
} else if(n == p->left && p == g->right) {
rotate_right(p);
n = n->right;
p = n->parent;
g = p->parent;
}
// case 4 step 2
assert(p && g);
if(n == p->left)
rotate_right(g);
else
rotate_left(g);
p->color = BLACK;
g->color = RED;
}
}
}
// ===========================================================
static void drawNode(ITNode *node) {
if(!node)
return;
drawNode(node->left);
if(node->color == RED)
printf(" %s [color=red,fontcolor=white,style=filled];\n", node->value);
else
printf(" %s [color=black,fontcolor=white,style=filled];\n", node->value);
drawNode(node->right);
}
static void drawLink(ITNode *node) {
if(!node)
return;
if(node->left) {
printf(" %s -> %s;\n", node->value, node->left->value);
drawLink(node->left);
}
if(node->right) {
printf(" %s -> %s;\n", node->value, node->right->value);
drawLink(node->right);
}
}
void drawGraph(ITNode *root) {
printf("digraph G {\n");
drawNode(root);
drawLink(root);
printf("}\n");
}
/*
dot -Tjpeg -o x.jpg g.dot
*/
int main(int argc, char *argv[]) {
ITNode *root = NULL;
str_intern(&root, "e");
str_intern(&root, "l");
str_intern(&root, "f");
str_intern(&root, "w");
str_intern(&root, "z");
str_intern(&root, "m");
str_intern(&root, "o");
str_intern(&root, "c");
str_intern(&root, "b");
str_intern(&root, "j");
str_intern(&root, "h");
str_intern(&root, "s");
str_intern(&root, "x");
str_intern(&root, "n");
str_intern(&root, "u");
str_intern(&root, "v");
str_intern(&root, "g");
str_intern(&root, "d");
str_intern(&root, "y");
str_intern(&root, "i");
str_intern(&root, "p");
str_intern(&root, "a");
str_intern(&root, "t");
str_intern(&root, "q");
str_intern(&root, "r");
str_intern(&root, "k");
str_intern(&root, "ga");
str_intern(&root, "zd");
str_intern(&root, "gh");
str_intern(&root, "ge");
str_intern(&root, "gk");
str_intern(&root, "ze");
str_intern(&root, "da");
str_intern(&root, "za");
str_intern(&root, "zb");
str_intern(&root, "gz");
str_intern(&root, "zc");
str_intern(&root, "aa");
str_intern(&root, "az");
str_intern(&root, "ac");
str_intern(&root, "ab");
str_intern(&root, "de");
str_intern(&root, "dg");
str_intern(&root, "zf");
str_intern(&root, "ma");
str_intern(&root, "mb");
str_intern(&root, "ea");
str_intern(&root, "eaa");
str_intern(&root, "eab");
str_intern(&root, "eaba");
str_intern(&root, "eabb");
drawGraph(root);
free_intern_tree(root);
/*
Note that this program will leak memory, because
`str_release()` needs to be called on all the return
values of `str_intern()` but I didn't bother with
it here
*/
return 0;
}
@wernsey
Copy link
Author

wernsey commented Sep 13, 2020

It outputs a file that can be passed to dot (part of the Graphviz package) to create an image file to visualize the Red-black tree.

Compile and run like so:

gcc -O0 -g -Wall intern.c
./a.out > g.dot
dot -Tjpeg -o g.jpg g.dot

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment