Skip to content

Instantly share code, notes, and snippets.

@starwing
Created January 14, 2019 07:21
Show Gist options
  • Save starwing/4bc558c962f6ad1fc16bf0a6470f3c13 to your computer and use it in GitHub Desktop.
Save starwing/4bc558c962f6ad1fc16bf0a6470f3c13 to your computer and use it in GitHub Desktop.
Huffman coding convertion
#define _CRT_SECURE_NO_WARNINGS
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
/* 能处理的最大文件大小,在这种情况下,最大的霍夫曼编码也不可能超过
* size_t 的大小 */
#define HC_SIZEMAX (~(size_t)0 - 100)
/* 该文件限制下最大的霍夫曼编码长度 */
#define HC_CODELEN (sizeof(size_t)*CHAR_BIT)
/* 字符表长度 */
#define HC_CHARCOUNT (1<<CHAR_BIT)
/* 字符掩码 */
#define HC_CHARMASK (HC_CHARCOUNT-1)
/* 一次处理多少bit编码(使用2^N叉树解析) */
#define HC_BITSHIFT 4
/* 最多有多少树节点:霍夫曼编码总节点数有字母表*2
* 字母表取8bit树,则最多有512个节点。使用2^N叉树最坏情况下一个节点存
* 储2N个二叉树节点,所以最大节点数为字母表/N */
#define HC_MAXNODES (HC_CHARCOUNT/HC_BITSHIFT)
/* 霍夫曼解码器:前两个字节为节点数量,后面是2^N叉树
* 每个节点是short,其中如果高8位是0,则低八位是子树节点索引。否则,高
* 8位是当前节点编码长度(最大255),低8位是字符 */
typedef struct HDecoder {
unsigned short count;
unsigned short nodes[HC_MAXNODES][1<<HC_BITSHIFT];
} HDecoder;
/* 霍夫曼编码器:code为编码的二进制值,len是编码的比特位 */
typedef struct HEncoder {
size_t code[HC_CHARCOUNT];
size_t len[HC_CHARCOUNT];
} HEncoder;
/* 二叉霍夫曼树节点,节点总数是字符集*2,所以left和right分别为
* CHAR_BIT+1就足够了 */
typedef struct HNode {
unsigned weight;
unsigned code: CHAR_BIT;
unsigned left: CHAR_BIT+1;
unsigned right: CHAR_BIT+1;
unsigned leaf: 1;
} HNode;
static void ht_traverse(HEncoder *e, HNode *nodes, size_t ni, size_t len) {
HNode *c = &nodes[ni];
if (c->leaf) {
assert(len < HC_CODELEN);
e->len[c->code] = len;
} else {
ht_traverse(e, nodes, c->left, len+1);
ht_traverse(e, nodes, c->right, len+1);
}
}
static void ht_initencoder(HEncoder *e) {
size_t bl_count[HC_CODELEN+1] = {0};
size_t next_code[HC_CODELEN+1] = {0};
size_t i, code = 0;
for (i = 0; i < HC_CHARCOUNT; ++i)
++bl_count[e->len[i]];
bl_count[0] = 0;
for (i = 1; i <= HC_CODELEN; ++i) {
code = (code + bl_count[i-1]) << 1;
next_code[i] = code;
}
for (i = 0; i < HC_CHARCOUNT; ++i)
if (e->len[i]) e->code[i] = next_code[e->len[i]]++;
}
static void ht_initdecoder(HDecoder *d, const HEncoder *e) {
size_t i;
memset(d, 0, sizeof(*d));
d->count = 1;
for (i = 0; i < HC_CHARCOUNT; ++i) {
size_t code = e->code[i], len = e->len[i];
size_t ti = 0;
while (len) {
size_t curr = len < HC_BITSHIFT ? len : HC_BITSHIFT;
size_t old = ti, bit = code >> (len - curr);
if ((len -= curr) == 0) {
size_t padding = HC_BITSHIFT - curr;
short data = (e->len[i]<<CHAR_BIT|i)&(~(short)0);
size_t j, count;
for (j = 0, count = 1<<padding; j < count; ++j)
d->nodes[ti][(bit<<padding)+j] = data;
} else if ((ti = d->nodes[ti][bit]) == 0) {
d->nodes[old][bit] = d->count;
ti = d->count++;
}
assert(ti < d->count);
code &= (1<<len) - 1;
}
}
}
static void ht_heapadjust(HNode *h, size_t i, size_t count) {
HNode curr = h[i];
size_t root, child;
for (root = i; (child = root*2+1) < count; root = child) {
if (child < count-1 && h[child].weight > h[child+1].weight)
++child;
if (curr.weight <= h[child].weight)
break;
h[root] = h[child];
}
h[root] = curr;
}
static size_t ht_buildencoder(HEncoder *e, HNode *nodes, size_t count) {
size_t i, hsize = count;
for (i = count/2; i > 0; --i)
ht_heapadjust(nodes, i, hsize);
ht_heapadjust(nodes, 0, hsize);
for (; --hsize > 0; ++count) {
HNode t = nodes[0];
nodes[0] = nodes[hsize];
nodes[hsize] = t;
ht_heapadjust(nodes, 0, hsize);
nodes[count] = nodes[0];
nodes[0].weight += nodes[hsize].weight;
nodes[0].code = 0;
nodes[0].left = hsize;
nodes[0].right = count;
nodes[0].leaf = 0;
ht_heapadjust(nodes, 0, hsize);
}
memset(e, 0, sizeof(HEncoder));
ht_traverse(e, nodes, 0, 0);
ht_initencoder(e);
return count;
}
static size_t ht_buildnode(HNode *nodes, FILE *fp, size_t *pfilelen) {
size_t i, count, weight[HC_CHARCOUNT] = {0};
size_t filelen = 0;
int ch;
while ((ch = fgetc(fp)) != EOF)
++weight[ch & HC_CHARMASK], ++filelen;
if (pfilelen) *pfilelen = filelen;
for (i = count = 0; i < HC_CHARCOUNT; ++i) {
if (weight[i]) {
nodes[count].weight = weight[i];
nodes[count].code = i;
nodes[count].leaf = 1;
++count;
}
}
for (i = 0; i < HC_CHARCOUNT && count < 2; ++i) {
if (!weight[i]) {
nodes[count].weight = weight[i];
nodes[count].code = i;
nodes[count].leaf = 1;
++count;
}
}
return count;
}
static void ht_encode(HEncoder *e, FILE *infp, FILE *outfp) {
int ch;
unsigned carry = 0;
size_t count = 0;
while ((ch = fgetc(infp)) != EOF) {
size_t code = e->code[ch & HC_CHARMASK];
size_t len = e->len[ch & HC_CHARMASK];
while (len) {
size_t curr = len < CHAR_BIT ? len : CHAR_BIT;
if (curr) {
carry = (carry<<curr)|(code>>(len - curr));
count += curr;
len -= curr;
}
/* count数量永远不会超过CHAR_BIT*2,所以不需要while */
assert(count < CHAR_BIT*2);
if (count >= CHAR_BIT) {
fputc(carry >> (count-CHAR_BIT), outfp);
count -= CHAR_BIT;
carry &= (1 << count) - 1;
}
}
}
assert(count <= CHAR_BIT);
fputc(carry << (CHAR_BIT-count), outfp);
}
static int ht_decode(HDecoder *d, size_t filelen, FILE *infp, FILE *outfp) {
int ch, carry = 0;
size_t ti = 0, count = 0;
while (filelen != 0 && (ch = fgetc(infp)) != EOF) {
carry = carry<<CHAR_BIT|(ch & HC_CHARMASK);
count += CHAR_BIT;
while (filelen != 0 && count >= HC_BITSHIFT) {
unsigned short code = d->nodes[ti][carry>>(count-HC_BITSHIFT)];
size_t len = code>>CHAR_BIT;
if (len == 0) {
count -= HC_BITSHIFT;
ti = code;
} else {
size_t remain = len % HC_BITSHIFT;
count -= remain ? remain : HC_BITSHIFT;
fputc(code&HC_CHARMASK, outfp), --filelen;
ti = 0;
}
carry &= (1 << count) - 1;
}
}
return filelen == 0;
}
static int report_error(const char *fn) {
fprintf(stderr, "%s: %s\n", fn, strerror(errno));
exit(1);
return 0; /* 避免警告 */
}
static void write_char(FILE *fp, int ch)
{ if (fputc(ch, fp) != ch) report_error("write"); }
static void write_size(FILE *fp, size_t sz) {
size_t i;
write_char(fp, HC_CODELEN);
for (i = 0; i < HC_CODELEN; ++i) {
write_char(fp, sz&HC_CHARMASK);
sz >>= CHAR_BIT;
}
}
static size_t read_size(FILE *fp) {
size_t i, sz = 0, count;
int ch;
if ((ch = fgetc(fp)) == EOF
|| (count = (size_t)ch) > HC_CODELEN)
return 0;
for (i = 0; i < count; ++i) {
if ((ch = fgetc(fp)) == EOF)
return 0;
sz |= (ch&HC_CHARMASK)<<(i*CHAR_BIT);
}
return sz;
}
static int encode_file(const char *infile, const char *outfile) {
FILE *infp, *outfp;
HEncoder e;
HNode nodes[HC_CHARCOUNT*2];
size_t i, count, filelen;
if ((infp = fopen(infile, "rb")) == NULL)
return report_error(infile);
if ((outfp = fopen(outfile, "wb")) == NULL)
return report_error(outfile);
count = ht_buildnode(nodes, infp, &filelen);
ht_buildencoder(&e, nodes, count);
write_size(outfp, filelen);
for (i = 0; i < HC_CHARCOUNT; ++i)
write_char(outfp, e.len[i]);
rewind(infp);
ht_encode(&e, infp, outfp);
fclose(infp);
fclose(outfp);
return 1;
}
static int decode_file(const char *infile, const char *outfile) {
FILE *infp, *outfp;
HEncoder e;
HDecoder d;
size_t i, filelen;
int ch;
if ((infp = fopen(infile, "rb")) == NULL)
return report_error(infile);
if ((outfp = fopen(outfile, "wb")) == NULL)
return report_error(outfile);
if ((filelen = read_size(infp)) == 0)
return report_error(infile);
if (filelen > HC_SIZEMAX) {
fprintf(stderr, "%s: file too big (size: %u)\n",
infile, (unsigned)filelen);
exit(1);
}
for (i = 0; i < HC_CHARCOUNT; ++i) {
if ((ch = fgetc(infp)) == EOF)
return report_error(infile);
if ((e.len[i] = (size_t)(ch & HC_CHARMASK)) > HC_CODELEN) {
fprintf(stderr, "%s: table error\n", infile);
exit(1);
}
}
ht_initencoder(&e);
ht_initdecoder(&d, &e);
if (!ht_decode(&d, filelen, infp, outfp)) {
fprintf(stderr, "%s: decode error\n", outfile);
exit(1);
}
fclose(infp);
fclose(outfp);
return 1;
}
int main(int argc, char **argv)
{
if (argc >= 4 && strcmp(argv[1], "encode") == 0)
encode_file(argv[2], argv[3]);
else if (argc >= 4 && strcmp(argv[1], "decode") == 0)
decode_file(argv[2], argv[3]);
else {
fprintf(stderr, "Usage: %s (encode|decode) infile outfile\n", argv[0]);
return 0;
}
return 0;
}
/* cc:run+='decode data.bin out.txt' */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment