In this project, you need to write a tool named “tcount” that will count the occurrences of terms in a given file. You will use hashing data structure to implement the tool.
The test data contains a dictionary file and a text file. For each term in the dictionary, count the occurrence number in the text file.
For example suppose the dictionary file contains:
apple pie
apple
banana
banana juice
orange
the content of the text file:
apple pie is sweet, and apple is sweet too.
I like banana juice but I don't like banana.
It's good to eat orange after dimmer.
I like banana tree.
Hello, apple pie.
What do you want to eat, apple or banana?
The result will be
3 banana
2 apple pie
2 apple
1 banana juice
1 orange
Write the output to result.txt
. It should be sorted in descendant order.
因 text file
極大(~ 12 MB),所以先利用 dict file
建立 hash_table,之後每從 text file
讀入一行,拆解該行,並拿去 hash_table 裡比對有沒有存在,若有,該字串的出現次數就加一。
另外,處理英文資料時,是基於 word,也就是說,如果 dict file
是
apple
pine
text file
是:
pineapple
那 apple
跟 pine
的出現次數都是 **0
** 。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
struct Node_ {
char* data;
int cnt;
struct Node_* next;
};
typedef struct Node_ Node;
Node hash_table[100000];
int hash(const char* s) {
const int len = strlen(s);
int value = 0;
for (int i=0; i<len; i++)
value = (value*33 + abs(s[i])) % 100000;
return value;
}
void ht_insert(const char* s) {
const int len = strlen(s);
const int value = hash(s);
// find empty node
Node* temp = &hash_table[value];
while (temp->data != NULL) {
if (strcmp(temp->data, s) == 0)
return;
temp = temp->next;
}
temp->data = (char*) malloc (sizeof(char) * (len + 1));
strncpy(temp->data, s, len);
(temp->data)[len] = '\0';
temp->cnt = 0;
temp->next = (Node*) malloc (sizeof(Node));
temp = temp->next;
temp->cnt = 0;
temp->data = NULL;
temp->next = NULL;
}
bool ht_append(const char* s) {
const int len = strlen(s);
const int value = hash(s);
//printf("appending:%s->", s);
Node* temp = &hash_table[value];
while (temp->data != NULL) {
if (strcmp(temp->data, s) == 0) {
(temp->cnt)++;
//printf("y\n");
return true;
}
temp = temp->next;
}
//printf("n\n");
return false;
}
typedef struct {
char* data;
int cnt;
} Sortable;
Sortable sort_data[10000000];
int sortable_cmp(const void* a, const void* b) {
Sortable* sa = (Sortable*) a;
Sortable* sb = (Sortable*) b;
if (sa->cnt < sb->cnt)
return -1;
if (sa->cnt > sb->cnt)
return +1;
return strcmp(sa->data, sb->data);
}
int min(const int a, const int b) {
return ((a < b) ? a : b);
}
void print_list(Node* n) {
if (n == NULL || n->data == NULL) return;
printf("%s,", n->data);
print_list(n->next);
}
void free_list(Node* n) {
if (n == NULL || n->data == NULL) return;
free_list(n->next);
free(n->data);
}
int main(int argc, char* argv[]) {
if (argc != 3) {
puts("Illegal arguments!");
return -1;
}
FILE* f_dic = fopen(argv[1], "r");
FILE* f_text = fopen(argv[2], "r");
if (f_dic == NULL || f_text == NULL) {
puts("File not exist!");
return -1;
}
// init hash table
for (int i=0; i<100000; i++) {
hash_table[i].cnt = 0;
hash_table[i].data = NULL;
hash_table[i].next = NULL;
}
char input[2048];
int max_len = -1;
while (fgets(input, 1024, f_dic) != NULL) {
int len = strlen(input);
if (input[len-1] == '\n') {
input[len-1] = '\0';
len--;
}
if (len == 0)
continue;
ht_insert(input);
if (len > max_len)
max_len = len;
}
fclose(f_dic);
// puts("Buiding Dict...Done:");
// for (int i=0; i<100000; i++) {
// if (hash_table[i].data != NULL) {
// printf("%5d: ", i);
// print_list(&hash_table[i]);
// puts("");
// }
// }
while (fgets(input, 2048, f_text) != NULL) {
int len = strlen(input);
if (input[len-1] == '\n') {
input[len-1] = '\0';
len--;
}
if (len == 0)
continue;
int pivot[2048];
int pivot_idx = 0;
pivot[pivot_idx++] = -1;
for (int i=0; i<len; i++)
if (isalpha(input[i]) == false &&
isdigit(input[i]) == false &&
strchr("\'", input[i]) == NULL)
pivot[pivot_idx++] = i;
pivot[pivot_idx++] = len;
// printf("max_len: %d\n", max_len);
// for (int i=0; i<pivot_idx; i++)
// printf("%d, ", pivot[i]);
// puts("");
for (int start=0; start < pivot_idx-1; start++) {
for (int end=start+1; end < pivot_idx; end++) {
int substr_len = pivot[end] - pivot[start];
if (substr_len > max_len)
break;
char temp = input[pivot[end]];
input[pivot[end]] = '\0';
ht_append(input + pivot[start] + 1);
input[pivot[end]] = temp;
}
}
}
fclose(f_text);
//printf("Process Data...ok\n");
int idx = 0;
for (int i=0; i<100000; i++) {
if (hash_table[i].data != NULL) {
Node* temp = &hash_table[i];
while (temp->data != NULL) {
if (temp->cnt != 0) {
sort_data[idx].data = temp->data;
sort_data[idx].cnt = temp->cnt;
idx++;
}
temp = temp->next;
}
}
}
//printf("data cnt: %d\n", idx);
qsort(sort_data, idx, sizeof(Sortable), sortable_cmp);
FILE* f_output = fopen("result.txt", "w");
for (int i=idx-1; i>=0; i--)
fprintf(f_output, "%d %s\n", sort_data[i].cnt, sort_data[i].data);
fclose(f_output);
for (int i=0; i<100000; i++) {
if (hash_table[i].data != NULL) {
free_list(&hash_table[i]);
}
}
return 0;
}