Skip to content

Instantly share code, notes, and snippets.

@amoshyc
Created January 27, 2015 07:37
Show Gist options
  • Save amoshyc/ce4eb28581f9a2f62dc3 to your computer and use it in GitHub Desktop.
Save amoshyc/ce4eb28581f9a2f62dc3 to your computer and use it in GitHub Desktop.
DS Project 8: Term Counting (Using hash table)

題目

In this project, you need to write a tool named “tcount” that will count the occurrences of terms in a given file. You will use hashing data structure to implement the tool.

The test data contains a dictionary file and a text file. For each term in the dictionary, count the occurrence number in the text file.

For example suppose the dictionary file contains:

apple pie
apple
banana
banana juice
orange

the content of the text file:

apple pie is sweet, and apple is sweet too.
I like banana juice but I don't like banana.
It's good to eat orange after dimmer.
I like banana tree.
Hello, apple pie.
What do you want to eat, apple or banana?

The result will be

3 banana
2 apple pie
2 apple
1 banana juice
1 orange

Write the output to result.txt. It should be sorted in descendant order.

解法

text file 極大(~ 12 MB),所以先利用 dict file 建立 hash_table,之後每從 text file 讀入一行,拆解該行,並拿去 hash_table 裡比對有沒有存在,若有,該字串的出現次數就加一。

另外,處理英文資料時,是基於 word,也就是說,如果 dict file

apple
pine

text file 是:

pineapple

applepine 的出現次數都是 **0 ** 。

Code

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>

struct Node_ {
    char* data;
    int cnt;
    struct Node_* next;
};

typedef struct Node_ Node;

Node hash_table[100000];

int hash(const char* s) {
    const int len = strlen(s);

    int value = 0;
    for (int i=0; i<len; i++)
        value = (value*33 + abs(s[i])) % 100000;
    return value;
}

void ht_insert(const char* s) {
    const int len = strlen(s);
    const int value = hash(s);

    // find empty node
    Node* temp = &hash_table[value];
    while (temp->data != NULL) {
        if (strcmp(temp->data, s) == 0)
            return;
        temp = temp->next;
    }

    temp->data = (char*) malloc (sizeof(char) * (len + 1));
    strncpy(temp->data, s, len);
    (temp->data)[len] = '\0';
    temp->cnt = 0;

    temp->next = (Node*) malloc (sizeof(Node));
    temp = temp->next;
    temp->cnt = 0;
    temp->data = NULL;
    temp->next = NULL;
}

bool ht_append(const char* s) {
    const int len = strlen(s);
    const int value = hash(s);

    //printf("appending:%s->", s);

    Node* temp = &hash_table[value];
    while (temp->data != NULL) {
        if (strcmp(temp->data, s) == 0) {
            (temp->cnt)++;
            //printf("y\n");
            return true;
        }
        temp = temp->next;
    }
    //printf("n\n");
    return false;
}

typedef struct {
    char* data;
    int cnt;
} Sortable;

Sortable sort_data[10000000];

int sortable_cmp(const void* a, const void* b) {
    Sortable* sa = (Sortable*) a;
    Sortable* sb = (Sortable*) b;

    if (sa->cnt < sb->cnt)
        return -1;
    if (sa->cnt > sb->cnt)
        return +1;
    return strcmp(sa->data, sb->data);
}

int min(const int a, const int b) {
    return ((a < b) ? a : b);
}

void print_list(Node* n) {
    if (n == NULL || n->data == NULL) return;

    printf("%s,", n->data);
    print_list(n->next);
}

void free_list(Node* n) {
    if (n == NULL || n->data == NULL) return;

    free_list(n->next);
    free(n->data);
}

int main(int argc, char* argv[]) {
    if (argc != 3) {
        puts("Illegal arguments!");
        return -1;
    }

    FILE* f_dic = fopen(argv[1], "r");
    FILE* f_text = fopen(argv[2], "r");
    if (f_dic == NULL || f_text == NULL) {
        puts("File not exist!");
        return -1;
    }

    // init hash table
    for (int i=0; i<100000; i++) {
        hash_table[i].cnt = 0;
        hash_table[i].data = NULL;
        hash_table[i].next = NULL;
    }

    char input[2048];
    int max_len = -1;
    while (fgets(input, 1024, f_dic) != NULL) {
        int len = strlen(input);
        if (input[len-1] == '\n') {
            input[len-1] = '\0';
            len--;
        }
        if (len == 0)
            continue;

        ht_insert(input);

        if (len > max_len)
            max_len = len;
    }
    fclose(f_dic);

    // puts("Buiding Dict...Done:");
    // for (int i=0; i<100000; i++) {
    //     if (hash_table[i].data != NULL) {
    //         printf("%5d: ", i);
    //         print_list(&hash_table[i]);
    //         puts("");
    //     }
    // }

    while (fgets(input, 2048, f_text) != NULL) {
        int len = strlen(input);
        if (input[len-1] == '\n') {
            input[len-1] = '\0';
            len--;
        }
        if (len == 0)
            continue;

        int pivot[2048];
        int pivot_idx = 0;
        pivot[pivot_idx++] = -1;
        for (int i=0; i<len; i++)
            if (isalpha(input[i]) == false &&
                isdigit(input[i]) == false &&
                strchr("\'", input[i]) == NULL)
                pivot[pivot_idx++] = i;
        pivot[pivot_idx++] = len;

        // printf("max_len: %d\n", max_len);
        // for (int i=0; i<pivot_idx; i++)
        //     printf("%d, ", pivot[i]);
        // puts("");

        for (int start=0; start < pivot_idx-1; start++) {
            for (int end=start+1; end < pivot_idx; end++) {
                int substr_len = pivot[end] - pivot[start];
                if (substr_len > max_len)
                    break;

                char temp = input[pivot[end]];
                input[pivot[end]] = '\0';
                ht_append(input + pivot[start] + 1);
                input[pivot[end]] = temp;
            }
        }
    }
    fclose(f_text);

    //printf("Process Data...ok\n");

    int idx = 0;
    for (int i=0; i<100000; i++) {
        if (hash_table[i].data != NULL) {
            Node* temp = &hash_table[i];
            while (temp->data != NULL) {
                if (temp->cnt != 0) {
                    sort_data[idx].data = temp->data;
                    sort_data[idx].cnt = temp->cnt;
                    idx++;
                }
                temp = temp->next;
            }
        }
    }

    //printf("data cnt: %d\n", idx);

    qsort(sort_data, idx, sizeof(Sortable), sortable_cmp);

    FILE* f_output = fopen("result.txt", "w");
    for (int i=idx-1; i>=0; i--)
        fprintf(f_output, "%d %s\n", sort_data[i].cnt, sort_data[i].data);
    fclose(f_output);

    for (int i=0; i<100000; i++) {
        if (hash_table[i].data != NULL) {
            free_list(&hash_table[i]);
        }
    }

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment