dylan-evans · February 4, 2012 14:53
diff --git a/simple_hash.c b/simple_hash.c
 /** \file simple_hash.h
 * 
 * This simple hashing routine was created after finding the hsearch function
 * a bit inadequate, due to the lack of a reentrant function in posix and
 * the inability to do simple operations such as delete... and for fun.
 */

 #include <string.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <malloc.h>

 #include "simple_hash.h"

 /**
 * Allocate a table object and set the hash function.
 *
 * \param size The number of entries in the hash.
 * \param hash_func The hash function. This can be set to null for sh_indexer.
 *
 * \return Newly allocated hash table.
 */
 sh_table *sh_create(int size, int (*hash_func)(char *))
 {
    sh_table *tab = malloc(sizeof(sh_table));
 	tab->size = size;
    tab->table = (sh_bucket *)malloc(sizeof(sh_bucket) * size);
 	memset(tab->table, '\0', sizeof(sh_bucket) * size);
 	
    if(hash_func)
        tab->hash_func = hash_func;
    else
        tab->hash_func = &sh_indexer;
 	return tab;
 }

 /** 
 * A very simple hash routine for use in testing. Actually this has provided
 * a pretty resonable spread for the assorted texts i have been running through
 * it.
 *
 * \param str The string to be hashed.
 * \return The hash key.
 */
 int sh_indexer(char *str)
 {
 	int hash, i;

 	for(hash = 0, i = 0; *str; str++, i++) {

 		hash ^= ((int)(*str) << (i % 32));
 	}
 	return hash;
 }

 /**
 * Find an entry. If no matching key is found then the entry and index is
 * set to the position of the next empty bucket, of course this rule is
 * dependent on there always being an empty bucket which requires a new
 * sh_bucket to be allocated as soon as all the buckets are in a sh_bucket 
 * are filled.
 *
 * \param tab The table object.
 * \param key The key string.
 * \param ent A location to store the matching entry.
 * \param index The matching index.
 *
 * \return 1 when an entry is found.
 */
 inline int sh_find_entry(sh_table *tab, char *key, sh_bucket **ent, int *index)
 {
 	int hkey;
    char *lkey, *rkey;

 	hkey = (*tab->hash_func)(key) % tab->size;
 	for(*ent = &tab->table[hkey]; *ent; *ent = (*ent)->next) {
 		for((*index) = 0; (*index) < SH_MAX_BUCKET; (*index)++) {
            lkey = (*ent)->key[*index];
            if(!lkey) return 0;
            for(rkey = key; *rkey == *lkey; rkey++, lkey++)
                if(!*rkey) return 1;
        }
 	}
    return 0;
 }

 /**
 * Add a new value to the hash table. This function does nothing if en entry 
 * with a matching key already exists.
 * 
 * \param tab The hash table to add to.
 * \param key The key to add.
 * \param val The value to associate with the key.
 *
 * \return 1 on success, 0 if the key already exists.
 * \sa sh_set sh_replace
 */
 int sh_add(sh_table *tab, char *key, void *val)
 {
    int index = 0;
    sh_bucket *ent;

    if(sh_find_entry(tab, key, &ent, &index))
        return 0; // Already exists
    ent->key[index] = key;
    ent->value[index] = val;
    if(index == (SH_MAX_BUCKET - 1)) {
        ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
        memset(ent->next, '\0', sizeof(sh_bucket));
        ent->next->prev = ent;
    }
    return 1;
 }

 /**
 * This function adds an item to the hash, adding it if
 * it doesn't already exist.
 *
 * \param tab The hash table.
 * \param key The key to add.
 * \param val The value to be set.
 *
 * \return Always returns 1
 * \sa sh_add sh_replace
 */
 int sh_set(sh_table *tab, char *key, void *val)
 {
    sh_bucket *ent;
    int index = 0;

    sh_find_entry(tab, key, &ent, &index);

    ent->key[index] = (char*) malloc(strlen(key));
    strcpy(ent->key[index], key);
    ent->value[index] = val;
    if(index == (SH_MAX_BUCKET - 1) && !ent->next) {
        ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
    }
    return 1;
 }

 /**
 * Replace an existing item. 
 *
 * \param tab The table object.
 * \param key The key string.
 * \param val The pointer to store.
 * 
 * \return 1 on success, 0 if the item doesn't exist.
 */
 int sh_replace(sh_table *tab, char *key, void *val)
 {
    int index;
    sh_bucket *ent;
    
    if(!sh_find_entry(tab, key, &ent, &index))
        return 0; // It doesn't exist
    ent->value[index] = val;
    return 1;
 }

 /**
 * Delete a matching item.
 *
 * \param tab The sh_table to check.
 * \param key The key to search for.
 *
 * \return 0 if no match is found.
 */
 int sh_delete(sh_table *tab, char *key)
 {
    int index = 0, n_index = 0;
    sh_bucket *ent, *n_ent;

    if(!sh_find_entry(tab, key, &ent, &index))
        return 0;

    // This routine shifts each entry into the place of the higher bucket
    n_ent = ent;
    while(ent && ent->key[index]) {
        n_index = index + 1;

        if(n_index == SH_MAX_BUCKET) {
            n_index = 0;
            n_ent = ent->next;
        }
        ent->key[index] = n_ent->key[n_index];
        ent->value[index] = n_ent->value[n_index];
        ent = n_ent;
        index = n_index;
    }

    if(n_index == SH_MAX_BUCKET) 
        free(ent->next);
    n_ent->key[n_index] = NULL;
    n_ent->value[n_index] = NULL;
    
    return 1;
 }

 /**
 * Find an item in the hash. 
 * 
 * \param tab The sh_table instance to search.
 * \param key The key to search for.
 *
 * \return The entry value or NULL if no item is found.
 */
 void *sh_find(sh_table *tab, char *key)
 {
    int index;
    sh_bucket *ent;

    // It would be better if you just didn't do this
    if(!tab || !key) return NULL;
    
    if(!sh_find_entry(tab, key, &ent, &index))
        return NULL;
    
    return ent->value[index];
 }

 /**
 * Send each key/value pair to a callback function. Although in many ways this
 * beats the purpose of using a hash it is very useful to do a brute force
 * scan of a hash. This function scans through each item in whatever position
 * it happens to be stored without any attempt at sorting.
 *
 * The callback has the paramaters (key, value, arg), 
 * \warning The function MUST return a positive value to continue scanning. 
 * Returning a zero value will abort the scan and return.
 *
 * \warning If an item is deleted in a scan the next item in an entry will be
 * missed if it exists, if it is the last item in an entry then it will likely
 * result in a segfault. Just don't do it.
 *
 * \param tab The table to scan.
 * \param arg An argument passed to the callback function, may be NULL.
 * \param cb The callback function
 *
 * \return True if the scan completed, zero if cancelled. 
 */
 int sh_each(sh_table *tab, void *arg, int (*cb)(char *, void *, void *))
 {
    int i, e;
    sh_bucket *ent;

    for(i = 0; i < tab->size; i++) {
        ent = &tab->table[i];
        while(ent && ent->key[0]) {
            for(e = 0; e < SH_MAX_BUCKET; e++) {
                if(!ent->key[e]) break; // End of the bucket
                if(!(*cb)(ent->key[e], ent->value[e], tab))
                    return 0;
            }
            ent = ent->next;
        }
    }
    return 1;
 }
diff --git a/simple_hash.h b/simple_hash.h

 #ifndef SIMPLE_HASH_H
 #define SIMPLE_HASH_H 1

 #define SH_MAX_BUCKET 9

 typedef struct _sh_bucket sh_bucket;
 struct _sh_bucket {
 	char *key[SH_MAX_BUCKET];
 	void *value[SH_MAX_BUCKET];
 	sh_bucket *prev, *next;	
 };


 typedef struct _sh_table sh_table;
 struct _sh_table {
 	unsigned int size;
 	sh_bucket *table;
 	int (*hash_func)(char *);
 };


 sh_table *sh_create(int, int (*)(char*));
 int sh_indexer(char *);
 int sh_add(sh_table *, char *, void *);
 int sh_set(sh_table *, char *, void *);
 int sh_delete(sh_table *, char *);
 void *sh_find(sh_table *, char *);
 int sh_each(sh_table *, void *, int (*)(char *, void *, void *));

 #endif
diff --git a/spellcheck.c b/spellcheck.c
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
 #include <string.h>
 #include <unistd.h>

 #include "simple_hash.h"

 // This is probably going to be different on some systems
 #define SYSTEM_DICT "/usr/share/dict/words"

 /** \file spellcheck.c
 *
 * This is a basic spell checker which loads data from the system dictionary
 * into a simple hash table and scans stdin for words not matching any entries.
 */

 int test_word(sh_table *tab, char *buf, int *col);
 int read_dict(FILE *f, char *buf);

 int main()
 {
    int len, line, col;
    int word_count, err_count;
    char buf[256];
    FILE *f;
    sh_table *tab;
    
    /* With a bit of experimentation i found 8192 to give a pretty even spread
     * with the default hash function */
    tab = sh_create(8192, NULL);
    f = fopen(SYSTEM_DICT, "r");
    if(!f) {
        perror("Couldn't open dictionary");
        exit(1);
    }
    while(!read_dict(f, buf))
        sh_set(tab, buf, (void*)1);
    fclose(f);

    err_count = word_count = 0;
    line = col = 0; // The position in the file
    while(!feof(stdin) && fgets(buf, 256, stdin)) {
        line++;
        // Each line
        for(col = 0; buf[col]; col++) {
            // Each character
            if(isalpha(buf[col])) {
                if( (len = test_word(tab, buf, &col)) ) {
                        err_count++;
                        printf("Bad word: '%.*s' (line: %d, column: %d)\n",
                            len, (buf + (col - len)), line, (col - len));
                        printf("%s%*s", buf, (col - len), "");
                        while(len--) putchar('^');
                        putchar('\n');
                }
                word_count++;
            }
        }
    }
    printf("%d words found with %d errors on %d lines\n", 
            word_count, err_count, line);
    return err_count;
 }

 /**
 * Read a word into the dictionary converting to lower case.
 *
 * \param f The input file.
 * \param buf The input buffer to use.
 *
 * \return The end of file status.
 */
 int read_dict(FILE *f, char *buf)
 {
    int i;
    fgets(buf, 256, f);
    for(i = 0; buf[i] != '\0'; i++) {
        if(buf[i] == '\n')
            buf[i] = '\0';
        else if(isupper(buf[i]))
            buf[i] += 32;
    }
    return feof(f);
 }


 /**
 * Parse and test the spelling of a single word.
 *
 * \param tab The sh_table with dictionary in it.
 * \param buf The input buffer.
 * \param col A pointer to the column index.
 *
 * \return 0 if the word is found, otherwise the length of the word.
 */
 int test_word(sh_table *tab, char *buf, int *col)
 {
    char word[256];
    int wi;

    buf += (*col);
    for(wi = 0; wi < 256 && (isalpha(*buf) || *buf == '\''); buf++, wi++)
        word[wi] = tolower(*buf);
    if(word[wi-1] == '\'') wi--;
    word[wi] = '\0';
    (*col) += wi ;
    if(sh_find(tab, word)) return 0;

    return wi;
 }
	/** \file simple_hash.h
	*
	* This simple hashing routine was created after finding the hsearch function
	* a bit inadequate, due to the lack of a reentrant function in posix and
	* the inability to do simple operations such as delete... and for fun.
	*/

	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <malloc.h>

	#include "simple_hash.h"

	/**
	* Allocate a table object and set the hash function.
	*
	* \param size The number of entries in the hash.
	* \param hash_func The hash function. This can be set to null for sh_indexer.
	*
	* \return Newly allocated hash table.
	*/
	sh_table sh_create(int size, int (hash_func)(char *))
	{
	sh_table *tab = malloc(sizeof(sh_table));
	tab->size = size;
	tab->table = (sh_bucket )malloc(sizeof(sh_bucket) size);
	memset(tab->table, '\0', sizeof(sh_bucket) * size);

	if(hash_func)
	tab->hash_func = hash_func;
	else
	tab->hash_func = &sh_indexer;
	return tab;
	}

	/**
	* A very simple hash routine for use in testing. Actually this has provided
	* a pretty resonable spread for the assorted texts i have been running through
	* it.
	*
	* \param str The string to be hashed.
	* \return The hash key.
	*/
	int sh_indexer(char *str)
	{
	int hash, i;

	for(hash = 0, i = 0; *str; str++, i++) {

	hash ^= ((int)(*str) << (i % 32));
	}
	return hash;
	}

	/**
	* Find an entry. If no matching key is found then the entry and index is
	* set to the position of the next empty bucket, of course this rule is
	* dependent on there always being an empty bucket which requires a new
	* sh_bucket to be allocated as soon as all the buckets are in a sh_bucket
	* are filled.
	*
	* \param tab The table object.
	* \param key The key string.
	* \param ent A location to store the matching entry.
	* \param index The matching index.
	*
	* \return 1 when an entry is found.
	*/
	inline int sh_find_entry(sh_table tab, char key, sh_bucket *ent, int index)
	{
	int hkey;
	char lkey, rkey;

	hkey = (*tab->hash_func)(key) % tab->size;
	for(ent = &tab->table[hkey]; ent; ent = (ent)->next) {
	for((index) = 0; (index) < SH_MAX_BUCKET; (*index)++) {
	lkey = (ent)->key[index];
	if(!lkey) return 0;
	for(rkey = key; rkey == lkey; rkey++, lkey++)
	if(!*rkey) return 1;
	}
	}
	return 0;
	}

	/**
	* Add a new value to the hash table. This function does nothing if en entry
	* with a matching key already exists.
	*
	* \param tab The hash table to add to.
	* \param key The key to add.
	* \param val The value to associate with the key.
	*
	* \return 1 on success, 0 if the key already exists.
	* \sa sh_set sh_replace
	*/
	int sh_add(sh_table tab, char key, void *val)
	{
	int index = 0;
	sh_bucket *ent;

	if(sh_find_entry(tab, key, &ent, &index))
	return 0; // Already exists
	ent->key[index] = key;
	ent->value[index] = val;
	if(index == (SH_MAX_BUCKET - 1)) {
	ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
	memset(ent->next, '\0', sizeof(sh_bucket));
	ent->next->prev = ent;
	}
	return 1;
	}

	/**
	* This function adds an item to the hash, adding it if
	* it doesn't already exist.
	*
	* \param tab The hash table.
	* \param key The key to add.
	* \param val The value to be set.
	*
	* \return Always returns 1
	* \sa sh_add sh_replace
	*/
	int sh_set(sh_table tab, char key, void *val)
	{
	sh_bucket *ent;
	int index = 0;

	sh_find_entry(tab, key, &ent, &index);

	ent->key[index] = (char*) malloc(strlen(key));
	strcpy(ent->key[index], key);
	ent->value[index] = val;
	if(index == (SH_MAX_BUCKET - 1) && !ent->next) {
	ent->next = (sh_bucket*)malloc(sizeof(sh_bucket));
	}
	return 1;
	}

	/**
	* Replace an existing item.
	*
	* \param tab The table object.
	* \param key The key string.
	* \param val The pointer to store.
	*
	* \return 1 on success, 0 if the item doesn't exist.
	*/
	int sh_replace(sh_table tab, char key, void *val)
	{
	int index;
	sh_bucket *ent;

	if(!sh_find_entry(tab, key, &ent, &index))
	return 0; // It doesn't exist
	ent->value[index] = val;
	return 1;
	}

	/**
	* Delete a matching item.
	*
	* \param tab The sh_table to check.
	* \param key The key to search for.
	*
	* \return 0 if no match is found.
	*/
	int sh_delete(sh_table tab, char key)
	{
	int index = 0, n_index = 0;
	sh_bucket ent, n_ent;

	if(!sh_find_entry(tab, key, &ent, &index))
	return 0;

	// This routine shifts each entry into the place of the higher bucket
	n_ent = ent;
	while(ent && ent->key[index]) {
	n_index = index + 1;

	if(n_index == SH_MAX_BUCKET) {
	n_index = 0;
	n_ent = ent->next;
	}
	ent->key[index] = n_ent->key[n_index];
	ent->value[index] = n_ent->value[n_index];
	ent = n_ent;
	index = n_index;
	}

	if(n_index == SH_MAX_BUCKET)
	free(ent->next);
	n_ent->key[n_index] = NULL;
	n_ent->value[n_index] = NULL;

	return 1;
	}

	/**
	* Find an item in the hash.
	*
	* \param tab The sh_table instance to search.
	* \param key The key to search for.
	*
	* \return The entry value or NULL if no item is found.
	*/
	void sh_find(sh_table tab, char *key)
	{
	int index;
	sh_bucket *ent;

	// It would be better if you just didn't do this
	if(!tab \|\| !key) return NULL;

	if(!sh_find_entry(tab, key, &ent, &index))
	return NULL;

	return ent->value[index];
	}

	/**
	* Send each key/value pair to a callback function. Although in many ways this
	* beats the purpose of using a hash it is very useful to do a brute force
	* scan of a hash. This function scans through each item in whatever position
	* it happens to be stored without any attempt at sorting.
	*
	* The callback has the paramaters (key, value, arg),
	* \warning The function MUST return a positive value to continue scanning.
	* Returning a zero value will abort the scan and return.
	*
	* \warning If an item is deleted in a scan the next item in an entry will be
	* missed if it exists, if it is the last item in an entry then it will likely
	* result in a segfault. Just don't do it.
	*
	* \param tab The table to scan.
	* \param arg An argument passed to the callback function, may be NULL.
	* \param cb The callback function
	*
	* \return True if the scan completed, zero if cancelled.
	*/
	int sh_each(sh_table tab, void arg, int (cb)(char , void , void ))
	{
	int i, e;
	sh_bucket *ent;

	for(i = 0; i < tab->size; i++) {
	ent = &tab->table[i];
	while(ent && ent->key[0]) {
	for(e = 0; e < SH_MAX_BUCKET; e++) {
	if(!ent->key[e]) break; // End of the bucket
	if(!(*cb)(ent->key[e], ent->value[e], tab))
	return 0;
	}
	ent = ent->next;
	}
	}
	return 1;
	}

	#ifndef SIMPLE_HASH_H
	#define SIMPLE_HASH_H 1

	#define SH_MAX_BUCKET 9

	typedef struct _sh_bucket sh_bucket;
	struct _sh_bucket {
	char *key[SH_MAX_BUCKET];
	void *value[SH_MAX_BUCKET];
	sh_bucket prev, next;
	};


	typedef struct _sh_table sh_table;
	struct _sh_table {
	unsigned int size;
	sh_bucket *table;
	int (hash_func)(char );
	};


	sh_table sh_create(int, int ()(char*));
	int sh_indexer(char *);
	int sh_add(sh_table , char , void *);
	int sh_set(sh_table , char , void *);
	int sh_delete(sh_table , char );
	void sh_find(sh_table , char *);
	int sh_each(sh_table , void , int ()(char , void , void ));

	#endif
	#include <stdlib.h>
	#include <stdio.h>
	#include <ctype.h>
	#include <string.h>
	#include <unistd.h>

	#include "simple_hash.h"

	// This is probably going to be different on some systems
	#define SYSTEM_DICT "/usr/share/dict/words"

	/** \file spellcheck.c
	*
	* This is a basic spell checker which loads data from the system dictionary
	* into a simple hash table and scans stdin for words not matching any entries.
	*/

	int test_word(sh_table tab, char buf, int *col);
	int read_dict(FILE f, char buf);

	int main()
	{
	int len, line, col;
	int word_count, err_count;
	char buf[256];
	FILE *f;
	sh_table *tab;

	/* With a bit of experimentation i found 8192 to give a pretty even spread
	* with the default hash function */
	tab = sh_create(8192, NULL);
	f = fopen(SYSTEM_DICT, "r");
	if(!f) {
	perror("Couldn't open dictionary");
	exit(1);
	}
	while(!read_dict(f, buf))
	sh_set(tab, buf, (void*)1);
	fclose(f);

	err_count = word_count = 0;
	line = col = 0; // The position in the file
	while(!feof(stdin) && fgets(buf, 256, stdin)) {
	line++;
	// Each line
	for(col = 0; buf[col]; col++) {
	// Each character
	if(isalpha(buf[col])) {
	if( (len = test_word(tab, buf, &col)) ) {
	err_count++;
	printf("Bad word: '%.*s' (line: %d, column: %d)\n",
	len, (buf + (col - len)), line, (col - len));
	printf("%s%*s", buf, (col - len), "");
	while(len--) putchar('^');
	putchar('\n');
	}
	word_count++;
	}
	}
	}
	printf("%d words found with %d errors on %d lines\n",
	word_count, err_count, line);
	return err_count;
	}

	/**
	* Read a word into the dictionary converting to lower case.
	*
	* \param f The input file.
	* \param buf The input buffer to use.
	*
	* \return The end of file status.
	*/
	int read_dict(FILE f, char buf)
	{
	int i;
	fgets(buf, 256, f);
	for(i = 0; buf[i] != '\0'; i++) {
	if(buf[i] == '\n')
	buf[i] = '\0';
	else if(isupper(buf[i]))
	buf[i] += 32;
	}
	return feof(f);
	}


	/**
	* Parse and test the spelling of a single word.
	*
	* \param tab The sh_table with dictionary in it.
	* \param buf The input buffer.
	* \param col A pointer to the column index.
	*
	* \return 0 if the word is found, otherwise the length of the word.
	*/
	int test_word(sh_table tab, char buf, int *col)
	{
	char word[256];
	int wi;

	buf += (*col);
	for(wi = 0; wi < 256 && (isalpha(buf) \|\| buf == '\''); buf++, wi++)
	word[wi] = tolower(*buf);
	if(word[wi-1] == '\'') wi--;
	word[wi] = '\0';
	(*col) += wi ;
	if(sh_find(tab, word)) return 0;

	return wi;
	}