Created
May 3, 2011 17:13
-
-
Save EmilHernvall/953744 to your computer and use it in GitHub Desktop.
Find the most popular words in a file - Linked list and merge sort based version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// this has hacky support for swedish characters in utf8. | |
// i do not recommend this solution for any serious purposes. :) | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <wchar.h> | |
// convert a twobyte utf8 character into a single integer | |
#define MB(str) (((str[0] & 0xFF) << 8) | (str[1] & 0xFF)) | |
// our linked list for words | |
typedef struct word_ { | |
char *str; | |
int c; | |
struct word_ *next; | |
} word; | |
void *xmalloc(size_t size) | |
{ | |
void *res = malloc(size); | |
if (res == NULL) { | |
fprintf(stderr, "Memory allocation failed\n"); | |
abort(); | |
} | |
return res; | |
} | |
void *xstrdup(char *str) | |
{ | |
char *res = strdup(str); | |
if (res == NULL) { | |
fprintf(stderr, "Memory allocation failed\n"); | |
abort(); | |
} | |
return res; | |
} | |
// read a character from file, if it detects multibyte characters | |
// it merges them into a single integer | |
int readchar(FILE *fh) | |
{ | |
int c = fgetc(fh); | |
if (c == EOF) { | |
return c; | |
} | |
// all characters larger than 128 is unicode characters | |
if (c > 0x80) { | |
int c2 = fgetc(fh); | |
if (c2 == EOF) { | |
return c; | |
} | |
c = ((c & 0xFF) << 8) | (c2 & 0xFF); | |
} | |
return c; | |
} | |
// read a file from disk, and return a linked list | |
// containing all the words in the file | |
int get_words_ascii(word **words, char *path) | |
{ | |
FILE *fh = fopen(path, "r"); | |
if (!fh) { | |
return -1; | |
} | |
int pos = 0, wc = 0; | |
char buffer[1024]; | |
unsigned int c = fgetc(fh); | |
word *current_word = NULL; | |
do { | |
// look for letters | |
if ((c >= 'a' && c <= 'z') || | |
(c >= 'A' && c <= 'Z') || | |
c == 0x86 || c == 0x8F || | |
c == 0x84 || c == 0x8E || | |
c == 0x94 || c == 0x99 | |
) { | |
buffer[pos] = c; | |
pos++; | |
} | |
else if (pos > 0) { | |
buffer[pos] = 0; | |
word *new_word = (word*)xmalloc(sizeof(word)); | |
new_word->str = xstrdup(buffer); | |
new_word->c = 1; | |
new_word->next = current_word; | |
current_word = new_word; | |
memset(buffer, 0, sizeof(buffer)); | |
pos = 0; | |
wc++; | |
} | |
} while ((c = fgetc(fh)) != EOF); | |
fclose(fh); | |
*words = current_word; | |
return wc; | |
} | |
int get_words_utf8(word **words, char *path) | |
{ | |
FILE *fh = fopen(path, "r"); | |
if (!fh) { | |
return -1; | |
} | |
int pos = 0, wc = 0; | |
char buffer[1024]; | |
unsigned int c = readchar(fh); | |
word *current_word = NULL; | |
do { | |
// look for letters | |
if ((c >= 'a' && c <= 'z') || | |
(c >= 'A' && c <= 'Z') || | |
c == 0xC3A5 || c == 0xC385 || | |
c == 0xC3A4 || c == 0xC384 || | |
c == 0xC3B6 || c == 0xC396 | |
) { | |
if (c > 0x80) { | |
buffer[pos] = c >> 8; | |
buffer[pos+1] = c & 0xFF; | |
pos++; | |
} else { | |
buffer[pos] = c; | |
} | |
pos++; | |
} | |
else if (pos > 0) { | |
buffer[pos] = 0; | |
word *new_word = (word*)xmalloc(sizeof(word)); | |
new_word->str = xstrdup(buffer); | |
new_word->c = 1; | |
new_word->next = current_word; | |
current_word = new_word; | |
memset(buffer, 0, sizeof(buffer)); | |
pos = 0; | |
wc++; | |
} | |
} while ((c = readchar(fh)) != EOF); | |
fclose(fh); | |
*words = current_word; | |
return wc; | |
} | |
// free the memory for a linked list of words | |
void free_words(word *words) | |
{ | |
word *current_word; | |
for (current_word = words; current_word != NULL; ) { | |
word *next = current_word->next; | |
free(current_word->str); | |
free(current_word); | |
current_word = next; | |
} | |
} | |
// print a list of words. level can be used to specify | |
// indentation | |
void print_list(word *words, int level) | |
{ | |
word *current_word; | |
for (current_word = words; current_word != NULL; current_word = current_word->next) { | |
int j; | |
for (j = 0; j < level; j++) { | |
printf("\t"); | |
} | |
printf("%s %d\n", current_word->str, current_word->c); | |
} | |
} | |
// sort a list of words using a custom comparison function | |
word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*)) | |
{ | |
if (words == NULL) { | |
return words; | |
} | |
//printf("%d\n", count); | |
//print_list(words, level); | |
if (count == 1) { | |
words->next = NULL; | |
return words; | |
} | |
// split the list into two parts | |
int split = count / 2; | |
word *left, *right; | |
word *current_word; | |
int i = 0; | |
for (current_word = words; current_word != NULL; current_word = current_word->next) { | |
if (i == split - 1 || current_word->next == NULL) { | |
left = words; | |
right = current_word->next; | |
current_word->next = NULL; | |
break; | |
} | |
i++; | |
} | |
// sort recursively | |
word *base = merge_sort(left, split, level+1, cmpfunc); | |
word *cmp = merge_sort(right, count-split, level+1, cmpfunc); | |
//printf("left: %d\n", left->next == NULL); | |
//printf("right: %d\n", right->next == NULL); | |
// merge the two lists | |
if (cmpfunc(base, cmp) > 0) { | |
word *tmp = base; | |
base = cmp; | |
cmp = tmp; | |
} | |
word *result = base; | |
while (cmp != NULL) { | |
//printf("%s\n", cmp->str); | |
if (base->next == NULL) { | |
base->next = cmp; | |
break; | |
} | |
//printf("cmp: %s %s\n", base->next->str, cmp->str); | |
if (cmpfunc(base->next, cmp) > 0) { | |
word *next = cmp->next; | |
cmp->next = base->next; | |
base->next = cmp; | |
cmp = next; | |
} | |
else { | |
base = base->next; | |
} | |
} | |
//print_list(result, level); | |
//printf("\n"); | |
return result; | |
} | |
// take a sorted linked list of words, and merge all duplicates | |
// incrementing the counter | |
int reduce(word *words) | |
{ | |
word *current_word, *next; | |
int totalCount = 0; | |
for (current_word = words; current_word != NULL; ) { | |
for (next = current_word->next; next != NULL; ) { | |
if (strcmp(current_word->str, next->str) != 0) { | |
break; | |
} | |
current_word->c++; | |
word *tmp = next->next; | |
free(next->str); | |
free(next); | |
next = tmp; | |
} | |
totalCount++; | |
current_word->next = next; | |
current_word = next; | |
} | |
return totalCount; | |
} | |
int namecmp(word *a, word *b) | |
{ | |
return strcmp(b->str, a->str); | |
} | |
int countcmp(word *a, word *b) | |
{ | |
return b->c - a->c; | |
} | |
int main(int argc, char **argv) | |
{ | |
if (argc < 2) { | |
printf("usage: wordcount [file]\n"); | |
return 0; | |
} | |
int i; | |
for (i = 1; i < argc; i++) { | |
word *words; | |
// read words from file | |
int wordCount = get_words_ascii(&words, argv[i]); | |
if (wordCount == -1) { | |
continue; | |
} | |
printf("wordcount for %s: %d\n", argv[i], wordCount); | |
// first sorting pass: sort the words alphabetically | |
words = merge_sort(words, wordCount, 0, namecmp); | |
// eliminate duplicates | |
int newCount = reduce(words); | |
// sort by word frequency | |
words = merge_sort(words, newCount, 0, countcmp); | |
// print result | |
word *current_word; | |
int j = 0; | |
for (current_word = words; current_word != NULL; current_word = current_word->next) { | |
if (j > 10) { | |
break; | |
} | |
printf("%s %d\n", current_word->str, current_word->c); | |
j++; | |
} | |
free_words(words); | |
printf("\n"); | |
} | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Turns out this isn't nearly as efficient as the hashtable approach in https://gist.github.com/953745 . In fact, a perl script performed slightly better than this implementation.