Created
January 3, 2016 08:42
-
-
Save Sammons/d4db049d0206e23230e1 to your computer and use it in GitHub Desktop.
toy hash table of lines of text in a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
struct offset_node_s { | |
struct offset_node_s * next; | |
unsigned long offset; | |
}; | |
typedef struct offset_node_s offset_node; | |
struct index_s { | |
unsigned int size; | |
offset_node ** offsets_table; | |
}; | |
typedef struct index_s hash_index; | |
unsigned int hash(char* str, int strlength, unsigned int max_size) { | |
unsigned int value = 0; | |
unsigned int tmp = 0; | |
for (int i = 0; i < strlength; ++i) { | |
tmp |= (unsigned int)str[i]; | |
tmp = tmp << sizeof(char); | |
value ^= tmp; | |
tmp = 0; | |
} | |
return value % max_size; | |
} | |
void free_index(hash_index* ind) { | |
for (int i = 0; i < ind->size; ++i) { | |
offset_node* cur_ptr = NULL; | |
offset_node* next_ptr = NULL; | |
cur_ptr = ind->offsets_table[i]; | |
while (cur_ptr != NULL) { | |
next_ptr = cur_ptr->next; | |
free(cur_ptr); | |
cur_ptr = next_ptr; | |
} | |
ind->offsets_table[i] = NULL; | |
} | |
free(ind->offsets_table); | |
ind->offsets_table = NULL; | |
} | |
void append_to_list_with_key(hash_index* ind, unsigned int key, unsigned long offset) { | |
offset_node * new_node = (offset_node*) malloc(sizeof(offset_node)); | |
new_node->next = NULL; | |
new_node->offset = offset; | |
if (ind->offsets_table[key] != NULL) { | |
offset_node * existing_node = ind->offsets_table[key]; | |
while (existing_node->next != NULL) existing_node = existing_node->next; | |
existing_node->next = new_node; | |
} else { | |
ind->offsets_table[key] = new_node; | |
} | |
} | |
void read_file_into_index(FILE* f, hash_index* ind) { | |
int chars_read = 0; | |
char buf[1024] = {0}; | |
do { | |
memset(buf, 0, sizeof(buf)); | |
unsigned long value = ftell(f); | |
chars_read = fscanf(f, "%s", buf); | |
unsigned int key = hash(buf, strlen(buf), ind->size); | |
append_to_list_with_key(ind, key, value); | |
} while (chars_read > 0); | |
} | |
/* | |
format: | |
<table_size> | |
<key unsigned int>,<unsigned long offset 1>,<unsigned long offset 2>,<unsigned long offset 3>...\n | |
<key unsigned int>,<unsigned long offset 1>,<unsigned long offset 2>,<unsigned long offset 3>...\n | |
... | |
*/ | |
void write_index(FILE* f, hash_index* ind) { | |
fprintf(f, "%d\n", ind->size); | |
for (unsigned int i = 0; i < ind->size; ++i) { | |
if (ind->offsets_table[i] != NULL) { | |
offset_node* cur_ptr = ind->offsets_table[i]; | |
fprintf(f, "%d,", i); | |
while (cur_ptr != NULL) { | |
fprintf(f, "%lu,", cur_ptr->offset); | |
cur_ptr = cur_ptr->next; | |
} | |
fprintf(f, "|\n"); | |
} | |
} | |
} | |
void read_index(FILE* f, hash_index* ind) { | |
fscanf(f, "%d", &ind->size); | |
ind->offsets_table = (offset_node**)malloc(sizeof(offset_node*) * ind->size); | |
memset(ind->offsets_table, 0, sizeof(offset_node*) * ind->size); | |
unsigned int key = 0; | |
while (fscanf(f, "%d,", &key) > 0) { | |
int finished_with_this_line = 0; | |
unsigned long offset = 0; | |
while (fscanf(f, "%lu,", &offset)) { | |
append_to_list_with_key(ind, key, offset); | |
} | |
while (fgetc(f) != '\n'); | |
} | |
} | |
int compare_hash_indices(hash_index* i1, hash_index* i2) { | |
if (i1->size != i2->size) return 0; | |
for (int i = 0; i < i1->size; ++i) { | |
if ((i1->offsets_table[i] == NULL || i2->offsets_table[i] == NULL) | |
&& i1->offsets_table[i] != i2->offsets_table[i]) return 0; | |
else if (i1->offsets_table[i] != NULL) { | |
offset_node* p1 = i1->offsets_table[i]; | |
offset_node* p2 = i2->offsets_table[i]; | |
while (p1 != NULL) { | |
if (p1->offset != p2->offset) return 0; | |
if (p1->next == NULL && p2->next != NULL) return 0; | |
if (p1->next != NULL && p2->next == NULL) return 0; | |
p1 = p1->next; | |
p2 = p2->next; | |
} | |
} | |
} | |
return 1; | |
} | |
/* toy implementation of indexing text in lines of a file */ | |
int main(int argc, char const * argv[]) { | |
/* crude file read - please be secure in real life */ | |
FILE* f = fopen(argv[1], "rb"); | |
unsigned int max_size = 1000; | |
hash_index fileIndex = { 0 }; | |
fileIndex.size = max_size; | |
fileIndex.offsets_table = (offset_node**) malloc ( sizeof(offset_node*) * max_size ); | |
memset(fileIndex.offsets_table, 0, max_size); | |
read_file_into_index(f, &fileIndex); | |
fclose(f); | |
printf("%s\n", "writing"); | |
FILE* indexfile = fopen(argv[2], "wb"); | |
write_index(indexfile, &fileIndex); | |
fclose(indexfile); | |
printf("%s\n", "reading"); | |
FILE* readfile = fopen(argv[2], "rb"); | |
hash_index checkIndex = {0}; | |
read_index(readfile, &checkIndex); | |
fclose(readfile); | |
printf("%s\n", "comparing"); | |
if (compare_hash_indices(&fileIndex, &checkIndex)) { | |
printf("%s\n", "equal, write and read were the same"); | |
} else { | |
printf("%s\n", "there was an error"); | |
} | |
// printf("looking up the word \"hello\"\n"); | |
// char* to_find = "hello"; | |
// unsigned int hello_hash = hash(to_find, strlen(to_find), checkIndex.size); | |
// printf("hashes to %d\n", hello_hash); | |
// readfile = fopen(argv[1], "rb"); | |
// char tmp[1024] = {0}; | |
// if (checkIndex.offsets_table[hello_hash] != NULL) { | |
// printf("found at %lu\n", checkIndex.offsets_table[hello_hash]->offset); | |
// fseek(f, checkIndex.offsets_table[hello_hash]->offset, SEEK_SET); | |
// fscanf(f, "%s", tmp); | |
// printf("file read gives: %s\n", tmp); | |
// } | |
// fclose(readfile) | |
free_index(&checkIndex); | |
free_index(&fileIndex); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the help , how could I compile the code ?