Last active
August 29, 2015 13:56
-
-
Save Rag0n/9216175 to your computer and use it in GitHub Desktop.
bwt[quicksort] + mtf + huffman
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
// #include <stdbool.h> // bool | |
#define BLOCKSIZE 5000 | |
#define MAX_TREE_HT 100 | |
// Преобразование Барроуза-Уиллера | |
// int f = 53; // колво символов в алфавите mtf | |
struct CharAndFreq | |
{ | |
unsigned ch; | |
unsigned freq; | |
}; | |
struct CharAndBit | |
{ | |
unsigned ch; | |
int bits[8]; | |
}; | |
struct Node | |
{ | |
struct CharAndFreq data[1]; | |
struct Node *left, *right; // Left and right child of this node | |
}; | |
struct Heap | |
{ | |
unsigned size; // Current size of min heap | |
unsigned capacity; // capacity of min heap (count) | |
// unsigned int size; // Size of the allocated memory (in number of items) | |
// unsigned int capacity; // capacity of the elements in the heap | |
struct Node **array; // Array of Heap node pointers | |
}; | |
// A utility function to print an array of size n | |
void printArr(int arr[], int n, struct CharAndBit *result, int *pos) | |
{ | |
int i; | |
for (i = 0; i < n; ++i) | |
{ | |
printf("%d", arr[i]); | |
result[*pos].bits[i] = arr[i]; | |
} | |
result[*pos].bits[i] = 2; | |
printf("\n"); | |
} | |
struct Heap* createHeap(int capacity) // | |
{ | |
struct Heap *heap = (struct Heap*)malloc(sizeof(struct Heap)); | |
heap->size = 0; | |
heap->capacity = capacity; | |
heap->array = (struct Node**) malloc(heap->capacity * sizeof(struct Node*)); | |
return heap; | |
} | |
// A utility function to swap two min heap nodes | |
void swap(struct Node **one, struct Node **two) | |
{ | |
struct Node *temp = *one; | |
*one = *two; | |
*two = temp; | |
} | |
// The standard minHeapify function. | |
void Heapify(struct Heap* heap, int i) // | |
{ | |
int minimum = i; | |
int left = 2*i + 1; | |
int right = 2*i + 2; | |
if (left < heap->size && heap->array[left]->data[0].freq < heap->array[minimum]->data[0].freq) | |
minimum = left; | |
if (right < heap->size && heap->array[right]->data[0].freq < heap->array[minimum]->data[0].freq) | |
minimum = right; | |
if (minimum != i) | |
{ | |
swap(&heap->array[minimum], &heap->array[i]); | |
Heapify(heap, minimum); | |
} | |
} | |
struct Node* extract_min(struct Heap *heap) | |
{ | |
struct Mode *temp = heap->array[0]; | |
heap->array[0] = heap->array[heap->size - 1]; | |
heap->size--; | |
Heapify(heap, 0); | |
return temp; | |
} | |
// A standard funvtion to build min heap | |
void buildHeap(struct Heap *heap) // | |
{ | |
int lenght = heap->size - 1; | |
for (int i = lenght / 2; i > -1; --i) | |
Heapify(heap, i); | |
} | |
struct Node* newNode(char data, int freq) | |
{ | |
struct Node* temp = (struct Node*)malloc(sizeof(struct Node)); | |
temp->left = temp->right = NULL; // обнуляем указатели | |
temp->data[0].ch = data; | |
temp->data[0].freq = freq; | |
return temp; | |
} | |
void add(struct Heap *heap, struct Node *node) | |
{ | |
int i = heap->size; | |
heap->size++; | |
while(i > 0 && node->data[0].freq < heap->array[(i - 1)/2]->data[0].freq) | |
{ | |
heap->array[i] = heap->array[(i - 1)/2]; | |
i = (i - 1)/2; | |
} | |
heap->array[i] = node; | |
} | |
struct Node* buildTree(struct CharAndFreq data[], int size) // | |
{ | |
struct Node *top, *left, *right; | |
// создаем минимальную кучу | |
struct Heap *heap = createHeap(size); | |
for (int i = 0; i < size; ++i) | |
{ | |
heap->array[i] = newNode(data[i].ch, data[i].freq); | |
} | |
heap->size = size; | |
buildHeap(heap); | |
while(heap->size > 1) | |
{ | |
left = extract_min(heap); | |
right = extract_min(heap); | |
top = newNode('$', left->data[0].freq + right->data[0].freq); | |
top->left = left; | |
top->right = right; | |
add(heap, top); | |
} | |
// последний элемент - корень | |
return extract_min(heap); | |
} | |
// Prints huffman codes from the root of Huffman Tree. It uses arr[] to | |
// store codes | |
void HuffmanCodes(struct Node* root, int arr[], int top, struct CharAndBit result[], int *pos) | |
{ | |
// Assign 0 to left edge and recur | |
if (root->left) | |
{ | |
arr[top] = 0; | |
HuffmanCodes(root->left, arr, top + 1, result, pos); | |
} | |
// Assign 1 to right edge and recur | |
if (root->right) | |
{ | |
arr[top] = 1; | |
HuffmanCodes(root->right, arr, top + 1, result, pos); | |
} | |
// If this is a leaf node, then it contains one of the input | |
// characters, print the character and its code from arr[] | |
if (root->left == NULL && root->right == NULL) | |
{ | |
printf("%d: ", root->data[0].ch); | |
result[*pos].ch = root->data[0].ch; | |
printArr(arr, top, result, pos); | |
(*pos)++; | |
} | |
} | |
// MTF: | |
int mtf_move(int index, char *dict) | |
{ | |
int tmp1, tmp2, i = 1, x = dict[index]; | |
tmp1 = dict[i]; | |
dict[i] = x; | |
while(tmp1 != x) | |
{ | |
i++; | |
tmp2 = tmp1; | |
tmp1 = dict[i]; | |
dict[i] = tmp2; | |
} | |
return 0; | |
} | |
int mtf(int sym_mem, char *result_bwt) | |
{ | |
// составляем словарь | |
char dict[97] = {'e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', 'l', 'c', 'u', 'm', 'w', 'f', 'g', 'y', 'p', \ | |
'b', 'v', 'k', 'j', 'x', 'q', 'z', 'E', 'T', 'A', 'O', 'I', 'N', 'S', 'H', 'R', 'D', 'L', 'C', 'U', 'M', 'W', 'F', 'G', 'Y', 'P', 'B', \ | |
'V', 'K', 'X', 'J', 'Q', 'Z'}; | |
int f = 53; | |
for (int d = 32; d < 65; ++d) | |
{ | |
dict[f] = d; | |
f++; | |
} | |
for (int d = 91; d < 97; ++d) | |
{ | |
dict[f] = d; | |
f++; | |
} | |
for (int d = 123; d < 127; ++d) | |
{ | |
dict[f] = d; | |
f++; | |
} | |
// printf("%d\n", f); | |
// можно сделать бинарный поиск | |
int j, tmp; | |
for (int i = 0; i < sym_mem; ++i) | |
{ | |
// переписать на while? | |
for (j = 0; j < f; ++j) | |
if (result_bwt[i] == dict[j]) | |
break; | |
printf("%d ", j); | |
result_bwt[i] = j; | |
if (j > 1) | |
{ | |
mtf_move(j, dict); | |
} | |
else if (j == 1) | |
{ | |
tmp = dict[0]; | |
dict[0] = dict[1]; | |
dict[1] = tmp; | |
} | |
} | |
printf("\n"); | |
return f; | |
} | |
// BWT: | |
void bwtswap(char *f, char *l, int memory) | |
{ | |
char temp; | |
for (int i = 0; i < memory; ++i) | |
{ | |
temp = f[i]; | |
f[i] = l[i]; | |
l[i] = temp; | |
} | |
} | |
int mystrcmp(const char *a, const char *b){ | |
while ( *a && *b && *a == *b ) | |
++a, ++b; | |
return (*a - *b); | |
} | |
int cmp_min(char *f, char *l) | |
{ | |
int res = mystrcmp(f, l); | |
if (res <= 0) | |
return 1; | |
return 0; | |
} | |
int cmp_max(char *f, char *l) | |
{ | |
int res = mystrcmp(f, l); | |
if (res > 0) | |
return 1; | |
return 0; | |
} | |
void quickSortR(char *s_arr, int first, int last, int memory) | |
{ | |
// На входе - массив a[], a[N] - его последний элемент. | |
long i = 0, j = last; // поставить указатели на исходные места | |
char *x = s_arr+(first+(last-first)/2)*(memory+1); | |
// процедура разделения | |
do { | |
while (mystrcmp(s_arr+i*(memory+1), x) < 0 ) i++; | |
while (mystrcmp(s_arr+j*(memory+1), x) > 0 ) j--; | |
if (i <= j) { | |
bwtswap(s_arr+i*(memory+1), s_arr+j*(memory+1), memory); | |
i++; j--; | |
} | |
} while ( i<=j ); | |
// рекурсивные вызовы, если есть, что сортировать | |
if (i < last) | |
quickSortR(s_arr, i, last, memory); | |
if (first < j) | |
quickSortR(s_arr, first,j, memory); | |
} | |
int bwt(int memory, FILE *file, int *current_pos, char *result_bwt) | |
{ | |
// memory+1 по сути лишнее, но рефакторинг потом | |
char bwt_array[memory][memory+1]; // +1 для терминального нуля | |
for (int i = 0; i < memory-1; ++i) | |
bwt_array[0][i] = fgetc(file); | |
bwt_array[0][memory-1] = '$'; | |
for (int i = 1; i < memory; ++i) | |
{ | |
bwt_array[i][memory-1] = bwt_array[i-1][0]; | |
for (int j = 0; j < memory-1; ++j) | |
bwt_array[i][j] = bwt_array[i-1][j+1]; | |
} | |
for (int i = 0; i < memory; ++i) | |
bwt_array[i][memory] = '\0'; | |
for (int i = 0; i < memory; ++i) | |
printf("%s\n", bwt_array[i]); | |
printf("=======\n"); | |
quickSortR(bwt_array, 0, memory-1, memory); | |
for (int i = 0; i < memory; ++i) | |
printf("%s\n", bwt_array[i]); | |
printf("=======\n"); | |
// выбираем последний столбец | |
for (int i = 0, j = *current_pos; j < memory + *current_pos; ++i, ++j) | |
result_bwt[j] = bwt_array[i][memory-1]; | |
*current_pos += memory; | |
return 0; | |
} | |
int main() | |
{ | |
FILE *file, *en_file; | |
file = fopen("/home/rag0n/Programming/C/MyProjects/kursovaya/text/sample.txt", "r"); | |
int i, symbol, counter = 0, sum_mem = 0, current_pos = 0, alphabet; | |
for (i = 0; symbol != EOF; ++i) | |
symbol = fgetc(file); | |
// исправить на фсик | |
fclose(file); | |
file = fopen("/home/rag0n/Programming/C/MyProjects/kursovaya/text/sample.txt", "r"); | |
i -= 2; | |
printf("%d\n", i); | |
while(i > 0) | |
{ | |
counter++; | |
i -= BLOCKSIZE; | |
sum_mem += BLOCKSIZE+1; | |
} | |
sum_mem += i; // суммарное кол-во памяти для bwt | |
char result_bwt[sum_mem]; | |
// printf("i: %d\n", i); | |
// printf("%d\n", counter); | |
printf("sum mem %d\n", sum_mem); | |
while(counter > 1) | |
{ | |
bwt(BLOCKSIZE+1, file, ¤t_pos, result_bwt); | |
counter--; | |
} | |
// printf("%d\n", BLOCKSIZE+i+1); | |
bwt(BLOCKSIZE+i+1, file, ¤t_pos, result_bwt); | |
printf("\n\n"); | |
// for (int i = 0; i < sum_mem; ++i) | |
// printf("%c", result_bwt[i]); | |
printf("%s\n", result_bwt); | |
// printf("\n"); | |
alphabet = mtf(sum_mem, result_bwt); | |
for (int i = 0; i < sum_mem; ++i) | |
printf("%d ", result_bwt[i]); | |
printf("\n"); | |
fclose(file); | |
int j, pos = 0, *pnt; // частота вхождения символа | |
struct CharAndFreq data[256]; | |
struct CharAndBit result[256]; | |
int k = 0; // указатель на массив и кол-во массивов одновременно; | |
printf("---------------------\n"); | |
for (int i = 0; i < alphabet; ++i) | |
{ | |
j = 0; | |
data[k].freq = 0; | |
while(j < sum_mem) | |
{ | |
if (result_bwt[j] == i) | |
{ | |
data[k].freq++; | |
} | |
j++; | |
} | |
if (data[k].freq != 0) | |
{ | |
data[k].ch = i; | |
k++; | |
} | |
} | |
printf("%d\n", k); | |
printf("Частота встречаемости символов:\n"); | |
for (int i = 0; i < k; ++i) | |
{ | |
printf("%d %d\n", data[i].ch, data[i].freq); | |
} | |
struct Node *root = buildTree(data, k); | |
int codes[256], top = 0; | |
printf("---------------------\nHuffman codes:\n"); | |
HuffmanCodes(root, codes, top, result, &pos); | |
// printf("===============\n"); | |
// for (int i = 0; i < k; ++i) | |
// { | |
// printf("%d ", result[i].ch); | |
// for (pnt = result[i].bits; *pnt != 2; pnt++) | |
// printf("%d", *pnt); | |
// printf("\n"); | |
// } | |
// encode | |
file = fopen("/home/rag0n/Programming/C/MyProjects/kursovaya/text/sample.txt", "r"); | |
en_file = fopen("/home/rag0n/Programming/C/MyProjects/kursovaya/text/sample_result.txt", "w"); | |
int cnt = 0; | |
printf("%d\n", result[5].ch); | |
while(cnt < sum_mem) | |
{ | |
for (int i = 0; i < k; ++i) | |
{ | |
if (result_bwt[cnt] == result[i].ch) | |
{ | |
for (pnt = result[i].bits; *pnt != 2; pnt++) | |
{ | |
printf("%d", *pnt); | |
fprintf(en_file, "%d", *pnt); | |
} | |
break; | |
} | |
} | |
cnt++; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment