Skip to content

Instantly share code, notes, and snippets.

@tatut
Created August 15, 2025 10:46
Show Gist options
  • Save tatut/d3f05a97e26220daa44376214a0edb87 to your computer and use it in GitHub Desktop.
Save tatut/d3f05a97e26220daa44376214a0edb87 to your computer and use it in GitHub Desktop.
Trie in C
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include <ctype.h>
#define u8 uint8_t
#define u64 uint64_t
typedef struct trie {
struct trie *children;
u8 num_children;
char ch;
bool leaf;
void *data; // arbitrary user data
} trie;
int trie_node_compare(const void *a, const void *b) { return ((trie*)a)->ch - ((trie*)b)->ch; }
/* bsearch over sorted children */
trie *_trie_find_child(trie *node, char ch) {
u8 n = node->num_children;
if(n == 0) {
return NULL;
} else if(n == 1) {
return (node->children[0].ch == ch) ? &node->children[0] : NULL;
}
int l=0, r=n-1, i;
while(l <= r) {
i = l + ((r-l)/2);
int d = node->children[i].ch - ch;
if(d == 0) {
return &node->children[i];
} else if(d < 0) {
l = i+1;
} else if(d > 0) {
r = i-1;
}
}
return NULL;
}
/* Insert into the trie, returning the leaf node.
* If already exists, returns the existing node.
*/
trie *_trie_insert(trie *root, char *word, bool insert) {
char ch;
trie *at = root;
trie *node = root;
while((ch = *word++)) {
node = NULL;
// find child in current children
node = _trie_find_child(at, ch);
if(node == NULL) {
if(!insert) return NULL; // don't insert
// need to add child (PENDING: initially alloc some minium size, like 4?)
at->children = realloc(at->children, sizeof(trie)*(at->num_children+1));
if(!at->children) {
fprintf(stderr, "Realloc failed\n");
return NULL;
}
node = &at->children[at->num_children++];
node->ch = ch;
node->num_children = 0;
node->children = NULL;
node->data = NULL;
qsort(at->children, at->num_children, sizeof(trie), trie_node_compare);
// refind the node after sort
node = _trie_find_child(at, ch);
}
at = node;
}
assert(node != NULL);
if(insert) {
node->leaf = true;
}
return node;
}
trie *trie_insert(trie *t, char *word) { return _trie_insert(t, word, true); }
trie *trie_find(trie *t, char *word) { return _trie_insert(t, word, false); }
void _trie_starts_with(trie *at, char *prefix, char *buf, size_t left, void (*callback)(trie*, char*)) {
if(at->leaf) callback(at, prefix);
if(left < 2) return; // no more space
for(u8 i=0;i<at->num_children;i++) {
buf[0] = at->children[i].ch;
buf[1] = 0;
_trie_starts_with(&at->children[i], prefix, buf+1, left-1, callback);
}
}
// print all the words that start with given word
void trie_starts_with(trie *root, char *word, void (*callback)(trie*, char*)) {
char buf[512];
int len = strlen(word);
memcpy(buf, word, len);
buf[len] = 0;
trie *start = trie_find(root, word);
if(!start) { printf("not found: %s\n", word); return;}
_trie_starts_with(start, buf, &buf[len], 512-len, callback);
}
void wordcount(trie *root, char *word) {
trie *node = trie_insert(root, word);
u64 *count = (u64*) &node->data; // use userdata pointer directly as counter
*count += 1;
}
bool is_ws(char ch) {
return (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t');
}
void print_with_count(trie *n, char *word) {
printf("FOUND: %s (%llu occurences)\n", word, (u64) n->data);
}
void _trie_export_graph(FILE *f, trie *n, char *prefix, char *buf, size_t left) {
if(left < 2) return; // no more space
if(!n->ch) {
fprintf(f,"%s [label=\"root|{", prefix);
} else {
fprintf(f,"%s [label=\"'%c' (%s)|{", prefix, n->ch, n->leaf ? "leaf" : "branch");
}
if(n->leaf) printf(" got leaf %c\n", n->ch);
// digraph node
for(u8 i=0;i<n->num_children;i++) {
if(i > 0) fprintf(f, "|");
fprintf(f,"<%c> %c ", n->children[i].ch, n->children[i].ch);
}
fprintf(f,"}\"];\n");
// links to all my children
for(u8 i=0;i<n->num_children;i++) {
fprintf(f,"%s:%c -> %s%c;\n", prefix, n->children[i].ch, prefix, n->children[i].ch);
}
// recursively output children
for(u8 i=0;i<n->num_children;i++) {
buf[0] = n->children[i].ch;
buf[1] = 0;
_trie_export_graph(f, &n->children[i], prefix, buf+1, left-1);
}
}
void trie_export_graph(const char *file_name, trie *root) {
char buf[512] = "r_\0";
FILE *f = fopen(file_name, "w");
if(!f) return;
fprintf(f,
"digraph {\n"
" rankdir=LR;\n"
" node [ shape=record ];\n");
_trie_export_graph(f, root, buf, &buf[2], 510);
fprintf(f, "}\n");
}
int main(int argc, char **argv) {
if(argc < 3) {
printf("Usage: trie <corpus> word1 ... wordN\n"
"Loads corpus as trie and searches all given words.");
return 1;
}
/*
trie root={0};
trie_insert(&root, "hello");
trie_insert(&root, "hell");
trie_insert(&root, "world");
trie_insert(&root, "wat");
trie_insert(&root, "wait");
trie_insert(&root, "xylophone");
trie_export_graph("trie.dot", &root);
*/
FILE *corpus = fopen(argv[1], "r");
if(!corpus) {
fprintf(stderr, "Can't open corpus file: %s\n", argv[1]);
return 1;
}
struct stat fs;
int in = fileno(corpus);
fstat(in, &fs);
char *map = mmap(0, fs.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, in, 0);
if(!map) {
fprintf(stderr, "Could not load corpus: %s\n", argv[1]);
}
// Load the corpus, insert all words into the trie
char *end = map + fs.st_size;
char *at = map;
trie root = {0};
while(at < end) {
// skipws
while(at < end && is_ws(*at)) at++;
char *word = at;
while(at < end && !is_ws(*at)) at++;
*at = 0;
// convert to lower case
char *l = word;
while(*l) { *l = tolower(*l); l++; }
at++;
wordcount(&root, word);
}
for(int i=2; i<argc;i++) {
printf("-------\n");
trie *n = trie_find(&root, argv[i]);
if(!n) {
printf("Word: %s not found in corpus.\n", argv[i]);
} else {
printf("Words starting with: %s\n", argv[i]);
trie_starts_with(&root, argv[i], print_with_count);
}
}
return 0;
}
@tatut
Copy link
Author

tatut commented Aug 15, 2025

NOTE: no free because we are just quitting afterwards, if you use in a long running program, should implement a trie_free function.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment