EmilHernvall · May 3, 2011 17:13 · EmilHernvall · May 3, 2011
diff --git a/wordcount.c b/wordcount.c
 // this has hacky support for swedish characters in utf8.
 // i do not recommend this solution for any serious purposes. :)

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <wchar.h>

 // convert a twobyte utf8 character into a single integer
 #define MB(str) (((str[0] & 0xFF) << 8) | (str[1] & 0xFF))

 // our linked list for words
 typedef struct word_ {
 	char *str;
 	int c;
 	struct word_ *next;
 } word;

 void *xmalloc(size_t size)
 {
 	void *res = malloc(size);
 	if (res == NULL) {
 		fprintf(stderr, "Memory allocation failed\n");
 		abort();
 	}
 	return res;
 }

 void *xstrdup(char *str)
 {
 	char *res = strdup(str);
 	if (res == NULL) {
 		fprintf(stderr, "Memory allocation failed\n");
 		abort();
 	}
 	return res;
 }

 // read a character from file, if it detects multibyte characters
 // it merges them into a single integer
 int readchar(FILE *fh)
 {
 	int c = fgetc(fh);
 	if (c == EOF) {
 		return c;
 	}

 	// all characters larger than 128 is unicode characters
 	if (c > 0x80) {
 		int c2 = fgetc(fh);
 		if (c2 == EOF) {
 			return c;
 		}

 		c = ((c & 0xFF) << 8) | (c2 & 0xFF);
 	}

 	return c;
 }

 // read a file from disk, and return a linked list
 // containing all the words in the file
 int get_words_ascii(word **words, char *path)
 {
 	FILE *fh = fopen(path, "r");
 	if (!fh) {
 		return -1;
 	}

 	int pos = 0, wc = 0;
 	char buffer[1024];

 	unsigned int c = fgetc(fh);
 	word *current_word = NULL;
 	do {
 		// look for letters
 		if ((c >= 'a' && c <= 'z') || 
 				(c >= 'A' && c <= 'Z') ||
 				c == 0x86 || c == 0x8F ||
 				c == 0x84 || c == 0x8E ||
 				c == 0x94 || c == 0x99
 			) {
 			buffer[pos] = c;
 			pos++;
 		}
 		else if (pos > 0) {
 			buffer[pos] = 0;

 			word *new_word = (word*)xmalloc(sizeof(word));
 			new_word->str = xstrdup(buffer);
 			new_word->c = 1;
 			new_word->next = current_word;

 			current_word = new_word;

 			memset(buffer, 0, sizeof(buffer));
 			pos = 0;
 			wc++;
 		}
 	} while ((c = fgetc(fh)) != EOF);

 	fclose(fh);

 	*words = current_word;

 	return wc;
 }

 int get_words_utf8(word **words, char *path)
 {
 	FILE *fh = fopen(path, "r");
 	if (!fh) {
 		return -1;
 	}

 	int pos = 0, wc = 0;
 	char buffer[1024];

 	unsigned int c = readchar(fh);
 	word *current_word = NULL;
 	do {
 		// look for letters
 		if ((c >= 'a' && c <= 'z') || 
 				(c >= 'A' && c <= 'Z') ||
 				c == 0xC3A5 || c == 0xC385 ||
 				c == 0xC3A4 || c == 0xC384 ||
 				c == 0xC3B6 || c == 0xC396
 			) {
 			if (c > 0x80) {
 				buffer[pos] = c >> 8;
 				buffer[pos+1] = c & 0xFF;
 				pos++;
 			} else {
 				buffer[pos] = c;
 			}
 			pos++;
 		}
 		else if (pos > 0) {
 			buffer[pos] = 0;

 			word *new_word = (word*)xmalloc(sizeof(word));
 			new_word->str = xstrdup(buffer);
 			new_word->c = 1;
 			new_word->next = current_word;

 			current_word = new_word;

 			memset(buffer, 0, sizeof(buffer));
 			pos = 0;
 			wc++;
 		}
 	} while ((c = readchar(fh)) != EOF);

 	fclose(fh);

 	*words = current_word;

 	return wc;
 }

 // free the memory for a linked list of words
 void free_words(word *words)
 {
 		word *current_word;
 		for (current_word = words; current_word != NULL; ) {
 			word *next = current_word->next;
 			free(current_word->str);
 			free(current_word);
 			current_word = next;
 		}
 }

 // print a list of words. level can be used to specify
 // indentation
 void print_list(word *words, int level)
 {
 	word *current_word;
 	for (current_word = words; current_word != NULL; current_word = current_word->next) {
 		int j;
 		for (j = 0; j < level; j++) {
 			printf("\t");
 		}
 		printf("%s %d\n", current_word->str, current_word->c);
 	}
 }

 // sort a list of words using a custom comparison function
 word *merge_sort(word *words, int count, int level, int (*cmpfunc)(word*, word*))
 {
 	if (words == NULL) {
 		return words;
 	}

 	//printf("%d\n", count);
 	//print_list(words, level);

 	if (count == 1) {
 		words->next = NULL;
 		return words;
 	}

 	// split the list into two parts
 	int split = count / 2;

 	word *left, *right;
 	word *current_word;
 	int i = 0;
 	for (current_word = words; current_word != NULL; current_word = current_word->next) {
 		if (i == split - 1 || current_word->next == NULL) {
 			left = words;
 			right = current_word->next;
 			current_word->next = NULL;
 			break;
 		}
 		i++;
 	}

 	// sort recursively
 	word *base = merge_sort(left, split, level+1, cmpfunc);
 	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

 	//printf("left: %d\n", left->next == NULL);
 	//printf("right: %d\n", right->next == NULL);

 	// merge the two lists
 	if (cmpfunc(base, cmp) > 0) {
 		word *tmp = base;
 		base = cmp;
 		cmp = tmp;
 	}

 	word *result = base;
 	while (cmp != NULL) {
 		//printf("%s\n", cmp->str);
 		if (base->next == NULL) {
 			base->next = cmp;
 			break;
 		}

 		//printf("cmp: %s %s\n", base->next->str, cmp->str);
 		if (cmpfunc(base->next, cmp) > 0) {
 			word *next = cmp->next;
 			cmp->next = base->next;
 			base->next = cmp;
 			cmp = next;
 		}
 		else {
 			base = base->next;
 		}
 	}

 	//print_list(result, level);
 	//printf("\n");

 	return result;
 }

 // take a sorted linked list of words, and merge all duplicates
 // incrementing the counter
 int reduce(word *words)
 {
 	word *current_word, *next;
 	int totalCount = 0;
 	for (current_word = words; current_word != NULL; ) {
 		for (next = current_word->next; next != NULL; ) {
 			if (strcmp(current_word->str, next->str) != 0) {
 				break;
 			}
 			current_word->c++;
 			word *tmp = next->next;
 			free(next->str);
 			free(next);
 			next = tmp;
 		}

 		totalCount++;
 		current_word->next = next;
 		current_word = next;
 	}

 	return totalCount;
 }

 int namecmp(word *a, word *b)
 {
 	return strcmp(b->str, a->str);
 }

 int countcmp(word *a, word *b)
 {
 	return b->c - a->c;
 }

 int main(int argc, char **argv)
 {
 	if (argc < 2) {
 		printf("usage: wordcount [file]\n");
 		return 0;
 	}

 	int i;
 	for (i = 1; i < argc; i++) {
 		word *words;

 		// read words from file
 		int wordCount = get_words_ascii(&words, argv[i]);
 		if (wordCount == -1) {
 			continue;
 		}

 		printf("wordcount for %s: %d\n", argv[i], wordCount);

 		// first sorting pass: sort the words alphabetically 
 		words = merge_sort(words, wordCount, 0, namecmp);

 		// eliminate duplicates
 		int newCount = reduce(words);

 		// sort by word frequency
 		words = merge_sort(words, newCount, 0, countcmp);

 		// print result
 		word *current_word;
 		int j = 0;
 		for (current_word = words; current_word != NULL; current_word = current_word->next) {
 			if (j > 10) {
 				break;
 			}

 			printf("%s %d\n", current_word->str, current_word->c);
 			j++;
 		}

 		free_words(words);

 		printf("\n");
 	}

 	return 0;
 }
	// this has hacky support for swedish characters in utf8.
	// i do not recommend this solution for any serious purposes. :)

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <wchar.h>

	// convert a twobyte utf8 character into a single integer
	#define MB(str) (((str[0] & 0xFF) << 8) \| (str[1] & 0xFF))

	// our linked list for words
	typedef struct word_ {
	char *str;
	int c;
	struct word_ *next;
	} word;

	void *xmalloc(size_t size)
	{
	void *res = malloc(size);
	if (res == NULL) {
	fprintf(stderr, "Memory allocation failed\n");
	abort();
	}
	return res;
	}

	void xstrdup(char str)
	{
	char *res = strdup(str);
	if (res == NULL) {
	fprintf(stderr, "Memory allocation failed\n");
	abort();
	}
	return res;
	}

	// read a character from file, if it detects multibyte characters
	// it merges them into a single integer
	int readchar(FILE *fh)
	{
	int c = fgetc(fh);
	if (c == EOF) {
	return c;
	}

	// all characters larger than 128 is unicode characters
	if (c > 0x80) {
	int c2 = fgetc(fh);
	if (c2 == EOF) {
	return c;
	}

	c = ((c & 0xFF) << 8) \| (c2 & 0xFF);
	}

	return c;
	}

	// read a file from disk, and return a linked list
	// containing all the words in the file
	int get_words_ascii(word *words, char path)
	{
	FILE *fh = fopen(path, "r");
	if (!fh) {
	return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = fgetc(fh);
	word *current_word = NULL;
	do {
	// look for letters
	if ((c >= 'a' && c <= 'z') \|\|
	(c >= 'A' && c <= 'Z') \|\|
	c == 0x86 \|\| c == 0x8F \|\|
	c == 0x84 \|\| c == 0x8E \|\|
	c == 0x94 \|\| c == 0x99
	) {
	buffer[pos] = c;
	pos++;
	}
	else if (pos > 0) {
	buffer[pos] = 0;

	word new_word = (word)xmalloc(sizeof(word));
	new_word->str = xstrdup(buffer);
	new_word->c = 1;
	new_word->next = current_word;

	current_word = new_word;

	memset(buffer, 0, sizeof(buffer));
	pos = 0;
	wc++;
	}
	} while ((c = fgetc(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
	}

	int get_words_utf8(word *words, char path)
	{
	FILE *fh = fopen(path, "r");
	if (!fh) {
	return -1;
	}

	int pos = 0, wc = 0;
	char buffer[1024];

	unsigned int c = readchar(fh);
	word *current_word = NULL;
	do {
	// look for letters
	if ((c >= 'a' && c <= 'z') \|\|
	(c >= 'A' && c <= 'Z') \|\|
	c == 0xC3A5 \|\| c == 0xC385 \|\|
	c == 0xC3A4 \|\| c == 0xC384 \|\|
	c == 0xC3B6 \|\| c == 0xC396
	) {
	if (c > 0x80) {
	buffer[pos] = c >> 8;
	buffer[pos+1] = c & 0xFF;
	pos++;
	} else {
	buffer[pos] = c;
	}
	pos++;
	}
	else if (pos > 0) {
	buffer[pos] = 0;

	word new_word = (word)xmalloc(sizeof(word));
	new_word->str = xstrdup(buffer);
	new_word->c = 1;
	new_word->next = current_word;

	current_word = new_word;

	memset(buffer, 0, sizeof(buffer));
	pos = 0;
	wc++;
	}
	} while ((c = readchar(fh)) != EOF);

	fclose(fh);

	*words = current_word;

	return wc;
	}

	// free the memory for a linked list of words
	void free_words(word *words)
	{
	word *current_word;
	for (current_word = words; current_word != NULL; ) {
	word *next = current_word->next;
	free(current_word->str);
	free(current_word);
	current_word = next;
	}
	}

	// print a list of words. level can be used to specify
	// indentation
	void print_list(word *words, int level)
	{
	word *current_word;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	int j;
	for (j = 0; j < level; j++) {
	printf("\t");
	}
	printf("%s %d\n", current_word->str, current_word->c);
	}
	}

	// sort a list of words using a custom comparison function
	word merge_sort(word words, int count, int level, int (cmpfunc)(word, word*))
	{
	if (words == NULL) {
	return words;
	}

	//printf("%d\n", count);
	//print_list(words, level);

	if (count == 1) {
	words->next = NULL;
	return words;
	}

	// split the list into two parts
	int split = count / 2;

	word left, right;
	word *current_word;
	int i = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (i == split - 1 \|\| current_word->next == NULL) {
	left = words;
	right = current_word->next;
	current_word->next = NULL;
	break;
	}
	i++;
	}

	// sort recursively
	word *base = merge_sort(left, split, level+1, cmpfunc);
	word *cmp = merge_sort(right, count-split, level+1, cmpfunc);

	//printf("left: %d\n", left->next == NULL);
	//printf("right: %d\n", right->next == NULL);

	// merge the two lists
	if (cmpfunc(base, cmp) > 0) {
	word *tmp = base;
	base = cmp;
	cmp = tmp;
	}

	word *result = base;
	while (cmp != NULL) {
	//printf("%s\n", cmp->str);
	if (base->next == NULL) {
	base->next = cmp;
	break;
	}

	//printf("cmp: %s %s\n", base->next->str, cmp->str);
	if (cmpfunc(base->next, cmp) > 0) {
	word *next = cmp->next;
	cmp->next = base->next;
	base->next = cmp;
	cmp = next;
	}
	else {
	base = base->next;
	}
	}

	//print_list(result, level);
	//printf("\n");

	return result;
	}

	// take a sorted linked list of words, and merge all duplicates
	// incrementing the counter
	int reduce(word *words)
	{
	word current_word, next;
	int totalCount = 0;
	for (current_word = words; current_word != NULL; ) {
	for (next = current_word->next; next != NULL; ) {
	if (strcmp(current_word->str, next->str) != 0) {
	break;
	}
	current_word->c++;
	word *tmp = next->next;
	free(next->str);
	free(next);
	next = tmp;
	}

	totalCount++;
	current_word->next = next;
	current_word = next;
	}

	return totalCount;
	}

	int namecmp(word a, word b)
	{
	return strcmp(b->str, a->str);
	}

	int countcmp(word a, word b)
	{
	return b->c - a->c;
	}

	int main(int argc, char **argv)
	{
	if (argc < 2) {
	printf("usage: wordcount [file]\n");
	return 0;
	}

	int i;
	for (i = 1; i < argc; i++) {
	word *words;

	// read words from file
	int wordCount = get_words_ascii(&words, argv[i]);
	if (wordCount == -1) {
	continue;
	}

	printf("wordcount for %s: %d\n", argv[i], wordCount);

	// first sorting pass: sort the words alphabetically
	words = merge_sort(words, wordCount, 0, namecmp);

	// eliminate duplicates
	int newCount = reduce(words);

	// sort by word frequency
	words = merge_sort(words, newCount, 0, countcmp);

	// print result
	word *current_word;
	int j = 0;
	for (current_word = words; current_word != NULL; current_word = current_word->next) {
	if (j > 10) {
	break;
	}

	printf("%s %d\n", current_word->str, current_word->c);
	j++;
	}

	free_words(words);

	printf("\n");
	}

	return 0;
	}