Created
February 4, 2012 14:53
-
-
Save dylan-evans/1738281 to your computer and use it in GitHub Desktop.
Simple hash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** \file simple_hash.h | |
* | |
* This simple hashing routine was created after finding the hsearch function | |
* a bit inadequate, due to the lack of a reentrant function in posix and | |
* the inability to do simple operations such as delete... and for fun. | |
*/ | |
#include <string.h> | |
#include <stdio.h> | |
#include <unistd.h> | |
#include <malloc.h> | |
#include "simple_hash.h" | |
/** | |
* Allocate a table object and set the hash function. | |
* | |
* \param size The number of entries in the hash. | |
* \param hash_func The hash function. This can be set to null for sh_indexer. | |
* | |
* \return Newly allocated hash table. | |
*/ | |
sh_table *sh_create(int size, int (*hash_func)(char *)) | |
{ | |
sh_table *tab = malloc(sizeof(sh_table)); | |
tab->size = size; | |
tab->table = (sh_bucket *)malloc(sizeof(sh_bucket) * size); | |
memset(tab->table, '\0', sizeof(sh_bucket) * size); | |
if(hash_func) | |
tab->hash_func = hash_func; | |
else | |
tab->hash_func = &sh_indexer; | |
return tab; | |
} | |
/** | |
* A very simple hash routine for use in testing. Actually this has provided | |
* a pretty resonable spread for the assorted texts i have been running through | |
* it. | |
* | |
* \param str The string to be hashed. | |
* \return The hash key. | |
*/ | |
int sh_indexer(char *str) | |
{ | |
int hash, i; | |
for(hash = 0, i = 0; *str; str++, i++) { | |
hash ^= ((int)(*str) << (i % 32)); | |
} | |
return hash; | |
} | |
/** | |
* Find an entry. If no matching key is found then the entry and index is | |
* set to the position of the next empty bucket, of course this rule is | |
* dependent on there always being an empty bucket which requires a new | |
* sh_bucket to be allocated as soon as all the buckets are in a sh_bucket | |
* are filled. | |
* | |
* \param tab The table object. | |
* \param key The key string. | |
* \param ent A location to store the matching entry. | |
* \param index The matching index. | |
* | |
* \return 1 when an entry is found. | |
*/ | |
inline int sh_find_entry(sh_table *tab, char *key, sh_bucket **ent, int *index) | |
{ | |
int hkey; | |
char *lkey, *rkey; | |
hkey = (*tab->hash_func)(key) % tab->size; | |
for(*ent = &tab->table[hkey]; *ent; *ent = (*ent)->next) { | |
for((*index) = 0; (*index) < SH_MAX_BUCKET; (*index)++) { | |
lkey = (*ent)->key[*index]; | |
if(!lkey) return 0; | |
for(rkey = key; *rkey == *lkey; rkey++, lkey++) | |
if(!*rkey) return 1; | |
} | |
} | |
return 0; | |
} | |
/** | |
* Add a new value to the hash table. This function does nothing if en entry | |
* with a matching key already exists. | |
* | |
* \param tab The hash table to add to. | |
* \param key The key to add. | |
* \param val The value to associate with the key. | |
* | |
* \return 1 on success, 0 if the key already exists. | |
* \sa sh_set sh_replace | |
*/ | |
int sh_add(sh_table *tab, char *key, void *val) | |
{ | |
int index = 0; | |
sh_bucket *ent; | |
if(sh_find_entry(tab, key, &ent, &index)) | |
return 0; // Already exists | |
ent->key[index] = key; | |
ent->value[index] = val; | |
if(index == (SH_MAX_BUCKET - 1)) { | |
ent->next = (sh_bucket*)malloc(sizeof(sh_bucket)); | |
memset(ent->next, '\0', sizeof(sh_bucket)); | |
ent->next->prev = ent; | |
} | |
return 1; | |
} | |
/** | |
* This function adds an item to the hash, adding it if | |
* it doesn't already exist. | |
* | |
* \param tab The hash table. | |
* \param key The key to add. | |
* \param val The value to be set. | |
* | |
* \return Always returns 1 | |
* \sa sh_add sh_replace | |
*/ | |
int sh_set(sh_table *tab, char *key, void *val) | |
{ | |
sh_bucket *ent; | |
int index = 0; | |
sh_find_entry(tab, key, &ent, &index); | |
ent->key[index] = (char*) malloc(strlen(key)); | |
strcpy(ent->key[index], key); | |
ent->value[index] = val; | |
if(index == (SH_MAX_BUCKET - 1) && !ent->next) { | |
ent->next = (sh_bucket*)malloc(sizeof(sh_bucket)); | |
} | |
return 1; | |
} | |
/** | |
* Replace an existing item. | |
* | |
* \param tab The table object. | |
* \param key The key string. | |
* \param val The pointer to store. | |
* | |
* \return 1 on success, 0 if the item doesn't exist. | |
*/ | |
int sh_replace(sh_table *tab, char *key, void *val) | |
{ | |
int index; | |
sh_bucket *ent; | |
if(!sh_find_entry(tab, key, &ent, &index)) | |
return 0; // It doesn't exist | |
ent->value[index] = val; | |
return 1; | |
} | |
/** | |
* Delete a matching item. | |
* | |
* \param tab The sh_table to check. | |
* \param key The key to search for. | |
* | |
* \return 0 if no match is found. | |
*/ | |
int sh_delete(sh_table *tab, char *key) | |
{ | |
int index = 0, n_index = 0; | |
sh_bucket *ent, *n_ent; | |
if(!sh_find_entry(tab, key, &ent, &index)) | |
return 0; | |
// This routine shifts each entry into the place of the higher bucket | |
n_ent = ent; | |
while(ent && ent->key[index]) { | |
n_index = index + 1; | |
if(n_index == SH_MAX_BUCKET) { | |
n_index = 0; | |
n_ent = ent->next; | |
} | |
ent->key[index] = n_ent->key[n_index]; | |
ent->value[index] = n_ent->value[n_index]; | |
ent = n_ent; | |
index = n_index; | |
} | |
if(n_index == SH_MAX_BUCKET) | |
free(ent->next); | |
n_ent->key[n_index] = NULL; | |
n_ent->value[n_index] = NULL; | |
return 1; | |
} | |
/** | |
* Find an item in the hash. | |
* | |
* \param tab The sh_table instance to search. | |
* \param key The key to search for. | |
* | |
* \return The entry value or NULL if no item is found. | |
*/ | |
void *sh_find(sh_table *tab, char *key) | |
{ | |
int index; | |
sh_bucket *ent; | |
// It would be better if you just didn't do this | |
if(!tab || !key) return NULL; | |
if(!sh_find_entry(tab, key, &ent, &index)) | |
return NULL; | |
return ent->value[index]; | |
} | |
/** | |
* Send each key/value pair to a callback function. Although in many ways this | |
* beats the purpose of using a hash it is very useful to do a brute force | |
* scan of a hash. This function scans through each item in whatever position | |
* it happens to be stored without any attempt at sorting. | |
* | |
* The callback has the paramaters (key, value, arg), | |
* \warning The function MUST return a positive value to continue scanning. | |
* Returning a zero value will abort the scan and return. | |
* | |
* \warning If an item is deleted in a scan the next item in an entry will be | |
* missed if it exists, if it is the last item in an entry then it will likely | |
* result in a segfault. Just don't do it. | |
* | |
* \param tab The table to scan. | |
* \param arg An argument passed to the callback function, may be NULL. | |
* \param cb The callback function | |
* | |
* \return True if the scan completed, zero if cancelled. | |
*/ | |
int sh_each(sh_table *tab, void *arg, int (*cb)(char *, void *, void *)) | |
{ | |
int i, e; | |
sh_bucket *ent; | |
for(i = 0; i < tab->size; i++) { | |
ent = &tab->table[i]; | |
while(ent && ent->key[0]) { | |
for(e = 0; e < SH_MAX_BUCKET; e++) { | |
if(!ent->key[e]) break; // End of the bucket | |
if(!(*cb)(ent->key[e], ent->value[e], tab)) | |
return 0; | |
} | |
ent = ent->next; | |
} | |
} | |
return 1; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SIMPLE_HASH_H | |
#define SIMPLE_HASH_H 1 | |
#define SH_MAX_BUCKET 9 | |
typedef struct _sh_bucket sh_bucket; | |
struct _sh_bucket { | |
char *key[SH_MAX_BUCKET]; | |
void *value[SH_MAX_BUCKET]; | |
sh_bucket *prev, *next; | |
}; | |
typedef struct _sh_table sh_table; | |
struct _sh_table { | |
unsigned int size; | |
sh_bucket *table; | |
int (*hash_func)(char *); | |
}; | |
sh_table *sh_create(int, int (*)(char*)); | |
int sh_indexer(char *); | |
int sh_add(sh_table *, char *, void *); | |
int sh_set(sh_table *, char *, void *); | |
int sh_delete(sh_table *, char *); | |
void *sh_find(sh_table *, char *); | |
int sh_each(sh_table *, void *, int (*)(char *, void *, void *)); | |
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <stdio.h> | |
#include <ctype.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include "simple_hash.h" | |
// This is probably going to be different on some systems | |
#define SYSTEM_DICT "/usr/share/dict/words" | |
/** \file spellcheck.c | |
* | |
* This is a basic spell checker which loads data from the system dictionary | |
* into a simple hash table and scans stdin for words not matching any entries. | |
*/ | |
int test_word(sh_table *tab, char *buf, int *col); | |
int read_dict(FILE *f, char *buf); | |
int main() | |
{ | |
int len, line, col; | |
int word_count, err_count; | |
char buf[256]; | |
FILE *f; | |
sh_table *tab; | |
/* With a bit of experimentation i found 8192 to give a pretty even spread | |
* with the default hash function */ | |
tab = sh_create(8192, NULL); | |
f = fopen(SYSTEM_DICT, "r"); | |
if(!f) { | |
perror("Couldn't open dictionary"); | |
exit(1); | |
} | |
while(!read_dict(f, buf)) | |
sh_set(tab, buf, (void*)1); | |
fclose(f); | |
err_count = word_count = 0; | |
line = col = 0; // The position in the file | |
while(!feof(stdin) && fgets(buf, 256, stdin)) { | |
line++; | |
// Each line | |
for(col = 0; buf[col]; col++) { | |
// Each character | |
if(isalpha(buf[col])) { | |
if( (len = test_word(tab, buf, &col)) ) { | |
err_count++; | |
printf("Bad word: '%.*s' (line: %d, column: %d)\n", | |
len, (buf + (col - len)), line, (col - len)); | |
printf("%s%*s", buf, (col - len), ""); | |
while(len--) putchar('^'); | |
putchar('\n'); | |
} | |
word_count++; | |
} | |
} | |
} | |
printf("%d words found with %d errors on %d lines\n", | |
word_count, err_count, line); | |
return err_count; | |
} | |
/** | |
* Read a word into the dictionary converting to lower case. | |
* | |
* \param f The input file. | |
* \param buf The input buffer to use. | |
* | |
* \return The end of file status. | |
*/ | |
int read_dict(FILE *f, char *buf) | |
{ | |
int i; | |
fgets(buf, 256, f); | |
for(i = 0; buf[i] != '\0'; i++) { | |
if(buf[i] == '\n') | |
buf[i] = '\0'; | |
else if(isupper(buf[i])) | |
buf[i] += 32; | |
} | |
return feof(f); | |
} | |
/** | |
* Parse and test the spelling of a single word. | |
* | |
* \param tab The sh_table with dictionary in it. | |
* \param buf The input buffer. | |
* \param col A pointer to the column index. | |
* | |
* \return 0 if the word is found, otherwise the length of the word. | |
*/ | |
int test_word(sh_table *tab, char *buf, int *col) | |
{ | |
char word[256]; | |
int wi; | |
buf += (*col); | |
for(wi = 0; wi < 256 && (isalpha(*buf) || *buf == '\''); buf++, wi++) | |
word[wi] = tolower(*buf); | |
if(word[wi-1] == '\'') wi--; | |
word[wi] = '\0'; | |
(*col) += wi ; | |
if(sh_find(tab, word)) return 0; | |
return wi; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment