Created
January 14, 2019 07:21
-
-
Save starwing/4bc558c962f6ad1fc16bf0a6470f3c13 to your computer and use it in GitHub Desktop.
Huffman coding convertion
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _CRT_SECURE_NO_WARNINGS | |
#include <assert.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <limits.h> | |
#include <errno.h> | |
/* 能处理的最大文件大小,在这种情况下,最大的霍夫曼编码也不可能超过 | |
* size_t 的大小 */ | |
#define HC_SIZEMAX (~(size_t)0 - 100) | |
/* 该文件限制下最大的霍夫曼编码长度 */ | |
#define HC_CODELEN (sizeof(size_t)*CHAR_BIT) | |
/* 字符表长度 */ | |
#define HC_CHARCOUNT (1<<CHAR_BIT) | |
/* 字符掩码 */ | |
#define HC_CHARMASK (HC_CHARCOUNT-1) | |
/* 一次处理多少bit编码(使用2^N叉树解析) */ | |
#define HC_BITSHIFT 4 | |
/* 最多有多少树节点:霍夫曼编码总节点数有字母表*2 | |
* 字母表取8bit树,则最多有512个节点。使用2^N叉树最坏情况下一个节点存 | |
* 储2N个二叉树节点,所以最大节点数为字母表/N */ | |
#define HC_MAXNODES (HC_CHARCOUNT/HC_BITSHIFT) | |
/* 霍夫曼解码器:前两个字节为节点数量,后面是2^N叉树 | |
* 每个节点是short,其中如果高8位是0,则低八位是子树节点索引。否则,高 | |
* 8位是当前节点编码长度(最大255),低8位是字符 */ | |
typedef struct HDecoder { | |
unsigned short count; | |
unsigned short nodes[HC_MAXNODES][1<<HC_BITSHIFT]; | |
} HDecoder; | |
/* 霍夫曼编码器:code为编码的二进制值,len是编码的比特位 */ | |
typedef struct HEncoder { | |
size_t code[HC_CHARCOUNT]; | |
size_t len[HC_CHARCOUNT]; | |
} HEncoder; | |
/* 二叉霍夫曼树节点,节点总数是字符集*2,所以left和right分别为 | |
* CHAR_BIT+1就足够了 */ | |
typedef struct HNode { | |
unsigned weight; | |
unsigned code: CHAR_BIT; | |
unsigned left: CHAR_BIT+1; | |
unsigned right: CHAR_BIT+1; | |
unsigned leaf: 1; | |
} HNode; | |
static void ht_traverse(HEncoder *e, HNode *nodes, size_t ni, size_t len) { | |
HNode *c = &nodes[ni]; | |
if (c->leaf) { | |
assert(len < HC_CODELEN); | |
e->len[c->code] = len; | |
} else { | |
ht_traverse(e, nodes, c->left, len+1); | |
ht_traverse(e, nodes, c->right, len+1); | |
} | |
} | |
static void ht_initencoder(HEncoder *e) { | |
size_t bl_count[HC_CODELEN+1] = {0}; | |
size_t next_code[HC_CODELEN+1] = {0}; | |
size_t i, code = 0; | |
for (i = 0; i < HC_CHARCOUNT; ++i) | |
++bl_count[e->len[i]]; | |
bl_count[0] = 0; | |
for (i = 1; i <= HC_CODELEN; ++i) { | |
code = (code + bl_count[i-1]) << 1; | |
next_code[i] = code; | |
} | |
for (i = 0; i < HC_CHARCOUNT; ++i) | |
if (e->len[i]) e->code[i] = next_code[e->len[i]]++; | |
} | |
static void ht_initdecoder(HDecoder *d, const HEncoder *e) { | |
size_t i; | |
memset(d, 0, sizeof(*d)); | |
d->count = 1; | |
for (i = 0; i < HC_CHARCOUNT; ++i) { | |
size_t code = e->code[i], len = e->len[i]; | |
size_t ti = 0; | |
while (len) { | |
size_t curr = len < HC_BITSHIFT ? len : HC_BITSHIFT; | |
size_t old = ti, bit = code >> (len - curr); | |
if ((len -= curr) == 0) { | |
size_t padding = HC_BITSHIFT - curr; | |
short data = (e->len[i]<<CHAR_BIT|i)&(~(short)0); | |
size_t j, count; | |
for (j = 0, count = 1<<padding; j < count; ++j) | |
d->nodes[ti][(bit<<padding)+j] = data; | |
} else if ((ti = d->nodes[ti][bit]) == 0) { | |
d->nodes[old][bit] = d->count; | |
ti = d->count++; | |
} | |
assert(ti < d->count); | |
code &= (1<<len) - 1; | |
} | |
} | |
} | |
static void ht_heapadjust(HNode *h, size_t i, size_t count) { | |
HNode curr = h[i]; | |
size_t root, child; | |
for (root = i; (child = root*2+1) < count; root = child) { | |
if (child < count-1 && h[child].weight > h[child+1].weight) | |
++child; | |
if (curr.weight <= h[child].weight) | |
break; | |
h[root] = h[child]; | |
} | |
h[root] = curr; | |
} | |
static size_t ht_buildencoder(HEncoder *e, HNode *nodes, size_t count) { | |
size_t i, hsize = count; | |
for (i = count/2; i > 0; --i) | |
ht_heapadjust(nodes, i, hsize); | |
ht_heapadjust(nodes, 0, hsize); | |
for (; --hsize > 0; ++count) { | |
HNode t = nodes[0]; | |
nodes[0] = nodes[hsize]; | |
nodes[hsize] = t; | |
ht_heapadjust(nodes, 0, hsize); | |
nodes[count] = nodes[0]; | |
nodes[0].weight += nodes[hsize].weight; | |
nodes[0].code = 0; | |
nodes[0].left = hsize; | |
nodes[0].right = count; | |
nodes[0].leaf = 0; | |
ht_heapadjust(nodes, 0, hsize); | |
} | |
memset(e, 0, sizeof(HEncoder)); | |
ht_traverse(e, nodes, 0, 0); | |
ht_initencoder(e); | |
return count; | |
} | |
static size_t ht_buildnode(HNode *nodes, FILE *fp, size_t *pfilelen) { | |
size_t i, count, weight[HC_CHARCOUNT] = {0}; | |
size_t filelen = 0; | |
int ch; | |
while ((ch = fgetc(fp)) != EOF) | |
++weight[ch & HC_CHARMASK], ++filelen; | |
if (pfilelen) *pfilelen = filelen; | |
for (i = count = 0; i < HC_CHARCOUNT; ++i) { | |
if (weight[i]) { | |
nodes[count].weight = weight[i]; | |
nodes[count].code = i; | |
nodes[count].leaf = 1; | |
++count; | |
} | |
} | |
for (i = 0; i < HC_CHARCOUNT && count < 2; ++i) { | |
if (!weight[i]) { | |
nodes[count].weight = weight[i]; | |
nodes[count].code = i; | |
nodes[count].leaf = 1; | |
++count; | |
} | |
} | |
return count; | |
} | |
static void ht_encode(HEncoder *e, FILE *infp, FILE *outfp) { | |
int ch; | |
unsigned carry = 0; | |
size_t count = 0; | |
while ((ch = fgetc(infp)) != EOF) { | |
size_t code = e->code[ch & HC_CHARMASK]; | |
size_t len = e->len[ch & HC_CHARMASK]; | |
while (len) { | |
size_t curr = len < CHAR_BIT ? len : CHAR_BIT; | |
if (curr) { | |
carry = (carry<<curr)|(code>>(len - curr)); | |
count += curr; | |
len -= curr; | |
} | |
/* count数量永远不会超过CHAR_BIT*2,所以不需要while */ | |
assert(count < CHAR_BIT*2); | |
if (count >= CHAR_BIT) { | |
fputc(carry >> (count-CHAR_BIT), outfp); | |
count -= CHAR_BIT; | |
carry &= (1 << count) - 1; | |
} | |
} | |
} | |
assert(count <= CHAR_BIT); | |
fputc(carry << (CHAR_BIT-count), outfp); | |
} | |
static int ht_decode(HDecoder *d, size_t filelen, FILE *infp, FILE *outfp) { | |
int ch, carry = 0; | |
size_t ti = 0, count = 0; | |
while (filelen != 0 && (ch = fgetc(infp)) != EOF) { | |
carry = carry<<CHAR_BIT|(ch & HC_CHARMASK); | |
count += CHAR_BIT; | |
while (filelen != 0 && count >= HC_BITSHIFT) { | |
unsigned short code = d->nodes[ti][carry>>(count-HC_BITSHIFT)]; | |
size_t len = code>>CHAR_BIT; | |
if (len == 0) { | |
count -= HC_BITSHIFT; | |
ti = code; | |
} else { | |
size_t remain = len % HC_BITSHIFT; | |
count -= remain ? remain : HC_BITSHIFT; | |
fputc(code&HC_CHARMASK, outfp), --filelen; | |
ti = 0; | |
} | |
carry &= (1 << count) - 1; | |
} | |
} | |
return filelen == 0; | |
} | |
static int report_error(const char *fn) { | |
fprintf(stderr, "%s: %s\n", fn, strerror(errno)); | |
exit(1); | |
return 0; /* 避免警告 */ | |
} | |
static void write_char(FILE *fp, int ch) | |
{ if (fputc(ch, fp) != ch) report_error("write"); } | |
static void write_size(FILE *fp, size_t sz) { | |
size_t i; | |
write_char(fp, HC_CODELEN); | |
for (i = 0; i < HC_CODELEN; ++i) { | |
write_char(fp, sz&HC_CHARMASK); | |
sz >>= CHAR_BIT; | |
} | |
} | |
static size_t read_size(FILE *fp) { | |
size_t i, sz = 0, count; | |
int ch; | |
if ((ch = fgetc(fp)) == EOF | |
|| (count = (size_t)ch) > HC_CODELEN) | |
return 0; | |
for (i = 0; i < count; ++i) { | |
if ((ch = fgetc(fp)) == EOF) | |
return 0; | |
sz |= (ch&HC_CHARMASK)<<(i*CHAR_BIT); | |
} | |
return sz; | |
} | |
static int encode_file(const char *infile, const char *outfile) { | |
FILE *infp, *outfp; | |
HEncoder e; | |
HNode nodes[HC_CHARCOUNT*2]; | |
size_t i, count, filelen; | |
if ((infp = fopen(infile, "rb")) == NULL) | |
return report_error(infile); | |
if ((outfp = fopen(outfile, "wb")) == NULL) | |
return report_error(outfile); | |
count = ht_buildnode(nodes, infp, &filelen); | |
ht_buildencoder(&e, nodes, count); | |
write_size(outfp, filelen); | |
for (i = 0; i < HC_CHARCOUNT; ++i) | |
write_char(outfp, e.len[i]); | |
rewind(infp); | |
ht_encode(&e, infp, outfp); | |
fclose(infp); | |
fclose(outfp); | |
return 1; | |
} | |
static int decode_file(const char *infile, const char *outfile) { | |
FILE *infp, *outfp; | |
HEncoder e; | |
HDecoder d; | |
size_t i, filelen; | |
int ch; | |
if ((infp = fopen(infile, "rb")) == NULL) | |
return report_error(infile); | |
if ((outfp = fopen(outfile, "wb")) == NULL) | |
return report_error(outfile); | |
if ((filelen = read_size(infp)) == 0) | |
return report_error(infile); | |
if (filelen > HC_SIZEMAX) { | |
fprintf(stderr, "%s: file too big (size: %u)\n", | |
infile, (unsigned)filelen); | |
exit(1); | |
} | |
for (i = 0; i < HC_CHARCOUNT; ++i) { | |
if ((ch = fgetc(infp)) == EOF) | |
return report_error(infile); | |
if ((e.len[i] = (size_t)(ch & HC_CHARMASK)) > HC_CODELEN) { | |
fprintf(stderr, "%s: table error\n", infile); | |
exit(1); | |
} | |
} | |
ht_initencoder(&e); | |
ht_initdecoder(&d, &e); | |
if (!ht_decode(&d, filelen, infp, outfp)) { | |
fprintf(stderr, "%s: decode error\n", outfile); | |
exit(1); | |
} | |
fclose(infp); | |
fclose(outfp); | |
return 1; | |
} | |
int main(int argc, char **argv) | |
{ | |
if (argc >= 4 && strcmp(argv[1], "encode") == 0) | |
encode_file(argv[2], argv[3]); | |
else if (argc >= 4 && strcmp(argv[1], "decode") == 0) | |
decode_file(argv[2], argv[3]); | |
else { | |
fprintf(stderr, "Usage: %s (encode|decode) infile outfile\n", argv[0]); | |
return 0; | |
} | |
return 0; | |
} | |
/* cc:run+='decode data.bin out.txt' */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment