Created
August 15, 2025 10:46
-
-
Save tatut/d3f05a97e26220daa44376214a0edb87 to your computer and use it in GitHub Desktop.
Trie in C
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <sys/mman.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <assert.h> | |
#include <ctype.h> | |
#define u8 uint8_t | |
#define u64 uint64_t | |
typedef struct trie { | |
struct trie *children; | |
u8 num_children; | |
char ch; | |
bool leaf; | |
void *data; // arbitrary user data | |
} trie; | |
int trie_node_compare(const void *a, const void *b) { return ((trie*)a)->ch - ((trie*)b)->ch; } | |
/* bsearch over sorted children */ | |
trie *_trie_find_child(trie *node, char ch) { | |
u8 n = node->num_children; | |
if(n == 0) { | |
return NULL; | |
} else if(n == 1) { | |
return (node->children[0].ch == ch) ? &node->children[0] : NULL; | |
} | |
int l=0, r=n-1, i; | |
while(l <= r) { | |
i = l + ((r-l)/2); | |
int d = node->children[i].ch - ch; | |
if(d == 0) { | |
return &node->children[i]; | |
} else if(d < 0) { | |
l = i+1; | |
} else if(d > 0) { | |
r = i-1; | |
} | |
} | |
return NULL; | |
} | |
/* Insert into the trie, returning the leaf node. | |
* If already exists, returns the existing node. | |
*/ | |
trie *_trie_insert(trie *root, char *word, bool insert) { | |
char ch; | |
trie *at = root; | |
trie *node = root; | |
while((ch = *word++)) { | |
node = NULL; | |
// find child in current children | |
node = _trie_find_child(at, ch); | |
if(node == NULL) { | |
if(!insert) return NULL; // don't insert | |
// need to add child (PENDING: initially alloc some minium size, like 4?) | |
at->children = realloc(at->children, sizeof(trie)*(at->num_children+1)); | |
if(!at->children) { | |
fprintf(stderr, "Realloc failed\n"); | |
return NULL; | |
} | |
node = &at->children[at->num_children++]; | |
node->ch = ch; | |
node->num_children = 0; | |
node->children = NULL; | |
node->data = NULL; | |
qsort(at->children, at->num_children, sizeof(trie), trie_node_compare); | |
// refind the node after sort | |
node = _trie_find_child(at, ch); | |
} | |
at = node; | |
} | |
assert(node != NULL); | |
if(insert) { | |
node->leaf = true; | |
} | |
return node; | |
} | |
trie *trie_insert(trie *t, char *word) { return _trie_insert(t, word, true); } | |
trie *trie_find(trie *t, char *word) { return _trie_insert(t, word, false); } | |
void _trie_starts_with(trie *at, char *prefix, char *buf, size_t left, void (*callback)(trie*, char*)) { | |
if(at->leaf) callback(at, prefix); | |
if(left < 2) return; // no more space | |
for(u8 i=0;i<at->num_children;i++) { | |
buf[0] = at->children[i].ch; | |
buf[1] = 0; | |
_trie_starts_with(&at->children[i], prefix, buf+1, left-1, callback); | |
} | |
} | |
// print all the words that start with given word | |
void trie_starts_with(trie *root, char *word, void (*callback)(trie*, char*)) { | |
char buf[512]; | |
int len = strlen(word); | |
memcpy(buf, word, len); | |
buf[len] = 0; | |
trie *start = trie_find(root, word); | |
if(!start) { printf("not found: %s\n", word); return;} | |
_trie_starts_with(start, buf, &buf[len], 512-len, callback); | |
} | |
void wordcount(trie *root, char *word) { | |
trie *node = trie_insert(root, word); | |
u64 *count = (u64*) &node->data; // use userdata pointer directly as counter | |
*count += 1; | |
} | |
bool is_ws(char ch) { | |
return (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'); | |
} | |
void print_with_count(trie *n, char *word) { | |
printf("FOUND: %s (%llu occurences)\n", word, (u64) n->data); | |
} | |
void _trie_export_graph(FILE *f, trie *n, char *prefix, char *buf, size_t left) { | |
if(left < 2) return; // no more space | |
if(!n->ch) { | |
fprintf(f,"%s [label=\"root|{", prefix); | |
} else { | |
fprintf(f,"%s [label=\"'%c' (%s)|{", prefix, n->ch, n->leaf ? "leaf" : "branch"); | |
} | |
if(n->leaf) printf(" got leaf %c\n", n->ch); | |
// digraph node | |
for(u8 i=0;i<n->num_children;i++) { | |
if(i > 0) fprintf(f, "|"); | |
fprintf(f,"<%c> %c ", n->children[i].ch, n->children[i].ch); | |
} | |
fprintf(f,"}\"];\n"); | |
// links to all my children | |
for(u8 i=0;i<n->num_children;i++) { | |
fprintf(f,"%s:%c -> %s%c;\n", prefix, n->children[i].ch, prefix, n->children[i].ch); | |
} | |
// recursively output children | |
for(u8 i=0;i<n->num_children;i++) { | |
buf[0] = n->children[i].ch; | |
buf[1] = 0; | |
_trie_export_graph(f, &n->children[i], prefix, buf+1, left-1); | |
} | |
} | |
void trie_export_graph(const char *file_name, trie *root) { | |
char buf[512] = "r_\0"; | |
FILE *f = fopen(file_name, "w"); | |
if(!f) return; | |
fprintf(f, | |
"digraph {\n" | |
" rankdir=LR;\n" | |
" node [ shape=record ];\n"); | |
_trie_export_graph(f, root, buf, &buf[2], 510); | |
fprintf(f, "}\n"); | |
} | |
int main(int argc, char **argv) { | |
if(argc < 3) { | |
printf("Usage: trie <corpus> word1 ... wordN\n" | |
"Loads corpus as trie and searches all given words."); | |
return 1; | |
} | |
/* | |
trie root={0}; | |
trie_insert(&root, "hello"); | |
trie_insert(&root, "hell"); | |
trie_insert(&root, "world"); | |
trie_insert(&root, "wat"); | |
trie_insert(&root, "wait"); | |
trie_insert(&root, "xylophone"); | |
trie_export_graph("trie.dot", &root); | |
*/ | |
FILE *corpus = fopen(argv[1], "r"); | |
if(!corpus) { | |
fprintf(stderr, "Can't open corpus file: %s\n", argv[1]); | |
return 1; | |
} | |
struct stat fs; | |
int in = fileno(corpus); | |
fstat(in, &fs); | |
char *map = mmap(0, fs.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, in, 0); | |
if(!map) { | |
fprintf(stderr, "Could not load corpus: %s\n", argv[1]); | |
} | |
// Load the corpus, insert all words into the trie | |
char *end = map + fs.st_size; | |
char *at = map; | |
trie root = {0}; | |
while(at < end) { | |
// skipws | |
while(at < end && is_ws(*at)) at++; | |
char *word = at; | |
while(at < end && !is_ws(*at)) at++; | |
*at = 0; | |
// convert to lower case | |
char *l = word; | |
while(*l) { *l = tolower(*l); l++; } | |
at++; | |
wordcount(&root, word); | |
} | |
for(int i=2; i<argc;i++) { | |
printf("-------\n"); | |
trie *n = trie_find(&root, argv[i]); | |
if(!n) { | |
printf("Word: %s not found in corpus.\n", argv[i]); | |
} else { | |
printf("Words starting with: %s\n", argv[i]); | |
trie_starts_with(&root, argv[i], print_with_count); | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NOTE: no
free
because we are just quitting afterwards, if you use in a long running program, should implement atrie_free
function.