starwing · January 14, 2019 07:21
diff --git a/huffman.c b/huffman.c
 #define _CRT_SECURE_NO_WARNINGS
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <errno.h>


 /* 能处理的最大文件大小，在这种情况下，最大的霍夫曼编码也不可能超过
 * size_t 的大小 */
 #define HC_SIZEMAX   (~(size_t)0 - 100)

 /* 该文件限制下最大的霍夫曼编码长度 */
 #define HC_CODELEN   (sizeof(size_t)*CHAR_BIT)

 /* 字符表长度 */
 #define HC_CHARCOUNT  (1<<CHAR_BIT)

 /* 字符掩码 */
 #define HC_CHARMASK (HC_CHARCOUNT-1)

 /* 一次处理多少bit编码（使用2^N叉树解析） */
 #define HC_BITSHIFT 4

 /* 最多有多少树节点：霍夫曼编码总节点数有字母表*2
 * 字母表取8bit树，则最多有512个节点。使用2^N叉树最坏情况下一个节点存
 * 储2N个二叉树节点，所以最大节点数为字母表/N */
 #define HC_MAXNODES (HC_CHARCOUNT/HC_BITSHIFT)

 /* 霍夫曼解码器：前两个字节为节点数量，后面是2^N叉树
 * 每个节点是short，其中如果高8位是0，则低八位是子树节点索引。否则，高
 * 8位是当前节点编码长度（最大255），低8位是字符 */
 typedef struct HDecoder {
    unsigned short count;
    unsigned short nodes[HC_MAXNODES][1<<HC_BITSHIFT];
 } HDecoder;

 /* 霍夫曼编码器：code为编码的二进制值，len是编码的比特位 */
 typedef struct HEncoder {
    size_t code[HC_CHARCOUNT];
    size_t len[HC_CHARCOUNT];
 } HEncoder;

 /* 二叉霍夫曼树节点，节点总数是字符集*2，所以left和right分别为
 * CHAR_BIT+1就足够了 */
 typedef struct HNode {
    unsigned weight;
    unsigned code:  CHAR_BIT;
    unsigned left:  CHAR_BIT+1;
    unsigned right: CHAR_BIT+1;
    unsigned leaf:  1;
 } HNode;

 static void ht_traverse(HEncoder *e, HNode *nodes, size_t ni, size_t len) {
    HNode *c = &nodes[ni];
    if (c->leaf) {
        assert(len < HC_CODELEN);
        e->len[c->code] = len;
    } else {
        ht_traverse(e, nodes, c->left, len+1);
        ht_traverse(e, nodes, c->right, len+1);
    }
 }

 static void ht_initencoder(HEncoder *e) {
    size_t bl_count[HC_CODELEN+1] = {0};
    size_t next_code[HC_CODELEN+1] = {0};
    size_t i, code = 0;
    for (i = 0; i < HC_CHARCOUNT; ++i)
        ++bl_count[e->len[i]];
    bl_count[0] = 0;
    for (i = 1; i <= HC_CODELEN; ++i) {
        code = (code + bl_count[i-1]) << 1;
        next_code[i] = code;
    }
    for (i = 0; i < HC_CHARCOUNT; ++i)
        if (e->len[i]) e->code[i] = next_code[e->len[i]]++;
 }

 static void ht_initdecoder(HDecoder *d, const HEncoder *e) {
    size_t i;
    memset(d, 0, sizeof(*d));
    d->count = 1;
    for (i = 0; i < HC_CHARCOUNT; ++i) {
        size_t code = e->code[i], len = e->len[i];
        size_t ti = 0;
        while (len) {
            size_t curr = len < HC_BITSHIFT ? len : HC_BITSHIFT;
            size_t old = ti, bit = code >> (len - curr);
            if ((len -= curr) == 0) {
                size_t padding = HC_BITSHIFT - curr;
                short  data    = (e->len[i]<<CHAR_BIT|i)&(~(short)0);
                size_t j, count;
                for (j = 0, count = 1<<padding; j < count; ++j)
                    d->nodes[ti][(bit<<padding)+j] = data;
            } else if ((ti = d->nodes[ti][bit]) == 0) {
                d->nodes[old][bit] = d->count;
                ti = d->count++;
            }
            assert(ti < d->count);
            code &= (1<<len) - 1;
        }
    }
 }

 static void ht_heapadjust(HNode *h, size_t i, size_t count) {
    HNode curr = h[i];
    size_t root, child;
    for (root = i; (child = root*2+1) < count; root = child) {
        if (child < count-1 && h[child].weight > h[child+1].weight)
            ++child;
        if (curr.weight <= h[child].weight)
            break;
        h[root] = h[child];
    }
    h[root] = curr;
 }

 static size_t ht_buildencoder(HEncoder *e, HNode *nodes, size_t count) {
    size_t i, hsize = count;
    for (i = count/2; i > 0; --i)
        ht_heapadjust(nodes, i, hsize);
    ht_heapadjust(nodes, 0, hsize);
    for (; --hsize > 0; ++count) {
        HNode t     = nodes[0];
        nodes[0]    = nodes[hsize];
        nodes[hsize] = t;
        ht_heapadjust(nodes, 0, hsize);
        nodes[count] = nodes[0];
        nodes[0].weight += nodes[hsize].weight;
        nodes[0].code    = 0;
        nodes[0].left    = hsize;
        nodes[0].right   = count;
        nodes[0].leaf    = 0;
        ht_heapadjust(nodes, 0, hsize);
    }
    memset(e, 0, sizeof(HEncoder));
    ht_traverse(e, nodes, 0, 0);
    ht_initencoder(e);
    return count;
 }

 static size_t ht_buildnode(HNode *nodes, FILE *fp, size_t *pfilelen) {
    size_t i, count, weight[HC_CHARCOUNT] = {0};
    size_t filelen = 0;
    int ch;
    while ((ch = fgetc(fp)) != EOF)
        ++weight[ch & HC_CHARMASK], ++filelen;
    if (pfilelen) *pfilelen = filelen;
    for (i = count = 0; i < HC_CHARCOUNT; ++i) {
        if (weight[i]) {
            nodes[count].weight = weight[i];
            nodes[count].code   = i;
            nodes[count].leaf   = 1;
            ++count;
        }
    }
    for (i = 0; i < HC_CHARCOUNT && count < 2; ++i) {
        if (!weight[i]) {
            nodes[count].weight = weight[i];
            nodes[count].code   = i;
            nodes[count].leaf   = 1;
            ++count;
        }
    } 
    return count;
 }

 static void ht_encode(HEncoder *e, FILE *infp, FILE *outfp) {
    int ch;
    unsigned carry = 0;
    size_t count = 0;
    while ((ch = fgetc(infp)) != EOF) {
        size_t code = e->code[ch & HC_CHARMASK];
        size_t len = e->len[ch & HC_CHARMASK];
        while (len) {
            size_t curr = len < CHAR_BIT ? len : CHAR_BIT;
            if (curr) {
                carry  = (carry<<curr)|(code>>(len - curr));
                count += curr;
                len   -= curr;
            }
            /* count数量永远不会超过CHAR_BIT*2，所以不需要while */
            assert(count < CHAR_BIT*2);
            if (count >= CHAR_BIT) {
                fputc(carry >> (count-CHAR_BIT), outfp);
                count -= CHAR_BIT;
                carry &= (1 << count) - 1;
            }
        }
    }
    assert(count <= CHAR_BIT);
    fputc(carry << (CHAR_BIT-count), outfp);
 }

 static int ht_decode(HDecoder *d, size_t filelen, FILE *infp, FILE *outfp) {
    int ch, carry = 0;
    size_t ti = 0, count = 0;
    while (filelen != 0 && (ch = fgetc(infp)) != EOF) {
        carry = carry<<CHAR_BIT|(ch & HC_CHARMASK);
        count += CHAR_BIT;
        while (filelen != 0 && count >= HC_BITSHIFT) {
            unsigned short code = d->nodes[ti][carry>>(count-HC_BITSHIFT)];
            size_t len = code>>CHAR_BIT;
            if (len == 0) {
                count -= HC_BITSHIFT;
                ti = code;
            } else {
                size_t remain = len % HC_BITSHIFT;
                count -= remain ? remain : HC_BITSHIFT;
                fputc(code&HC_CHARMASK, outfp), --filelen;
                ti = 0;
            }
            carry &= (1 << count) - 1;
        }
    }
    return filelen == 0;
 }

 static int report_error(const char *fn) {
    fprintf(stderr, "%s: %s\n", fn, strerror(errno));
    exit(1);
    return 0; /* 避免警告 */
 }

 static void write_char(FILE *fp, int ch)
 { if (fputc(ch, fp) != ch) report_error("write"); }

 static void write_size(FILE *fp, size_t sz) {
    size_t i;
    write_char(fp, HC_CODELEN);
    for (i = 0; i < HC_CODELEN; ++i) {
        write_char(fp, sz&HC_CHARMASK);
        sz >>= CHAR_BIT;
    }
 }

 static size_t read_size(FILE *fp) {
    size_t i, sz = 0, count;
    int ch;
    if ((ch = fgetc(fp)) == EOF
            || (count = (size_t)ch) > HC_CODELEN)
        return 0;
    for (i = 0; i < count; ++i) {
        if ((ch = fgetc(fp)) == EOF)
            return 0;
        sz |= (ch&HC_CHARMASK)<<(i*CHAR_BIT);
    }
    return sz;
 }

 static int encode_file(const char *infile, const char *outfile) {
    FILE *infp, *outfp;
    HEncoder e;
    HNode nodes[HC_CHARCOUNT*2];
    size_t i, count, filelen;
    if ((infp = fopen(infile, "rb")) == NULL)
        return report_error(infile);
    if ((outfp = fopen(outfile, "wb")) == NULL)
        return report_error(outfile);
    count = ht_buildnode(nodes, infp, &filelen);
    ht_buildencoder(&e, nodes, count);
    write_size(outfp, filelen);
    for (i = 0; i < HC_CHARCOUNT; ++i)
        write_char(outfp, e.len[i]);
    rewind(infp);
    ht_encode(&e, infp, outfp);
    fclose(infp);
    fclose(outfp);
    return 1;
 }

 static int decode_file(const char *infile, const char *outfile) {
    FILE *infp, *outfp;
    HEncoder e;
    HDecoder d;
    size_t i, filelen;
    int ch;
    if ((infp = fopen(infile, "rb")) == NULL)
        return report_error(infile);
    if ((outfp = fopen(outfile, "wb")) == NULL)
        return report_error(outfile);
    if ((filelen = read_size(infp)) == 0)
        return report_error(infile);
    if (filelen > HC_SIZEMAX) {
        fprintf(stderr, "%s: file too big (size: %u)\n",
                infile, (unsigned)filelen);
        exit(1);
    }
    for (i = 0; i < HC_CHARCOUNT; ++i) {
        if ((ch = fgetc(infp)) == EOF)
            return report_error(infile);
        if ((e.len[i] = (size_t)(ch & HC_CHARMASK)) > HC_CODELEN) {
            fprintf(stderr, "%s: table error\n", infile);
            exit(1);
        }
    }
    ht_initencoder(&e);
    ht_initdecoder(&d, &e);
    if (!ht_decode(&d, filelen, infp, outfp)) {
        fprintf(stderr, "%s: decode error\n", outfile);
        exit(1);
    }
    fclose(infp);
    fclose(outfp);
    return 1;
 }

 int main(int argc, char **argv)
 {
    if (argc >= 4 && strcmp(argv[1], "encode") == 0)
        encode_file(argv[2], argv[3]);
    else if (argc >= 4 && strcmp(argv[1], "decode") == 0)
        decode_file(argv[2], argv[3]);
    else {
        fprintf(stderr, "Usage: %s (encode|decode) infile outfile\n", argv[0]);
        return 0;
    }
    return 0;
 }

 /* cc:run+='decode data.bin out.txt' */
	#define _CRT_SECURE_NO_WARNINGS
	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <limits.h>
	#include <errno.h>


	/* 能处理的最大文件大小，在这种情况下，最大的霍夫曼编码也不可能超过
	* size_t 的大小 */
	#define HC_SIZEMAX (~(size_t)0 - 100)

	/* 该文件限制下最大的霍夫曼编码长度 */
	#define HC_CODELEN (sizeof(size_t)*CHAR_BIT)

	/* 字符表长度 */
	#define HC_CHARCOUNT (1<<CHAR_BIT)

	/* 字符掩码 */
	#define HC_CHARMASK (HC_CHARCOUNT-1)

	/* 一次处理多少bit编码（使用2^N叉树解析） */
	#define HC_BITSHIFT 4

	/* 最多有多少树节点：霍夫曼编码总节点数有字母表*2
	* 字母表取8bit树，则最多有512个节点。使用2^N叉树最坏情况下一个节点存
	* 储2N个二叉树节点，所以最大节点数为字母表/N */
	#define HC_MAXNODES (HC_CHARCOUNT/HC_BITSHIFT)

	/* 霍夫曼解码器：前两个字节为节点数量，后面是2^N叉树
	* 每个节点是short，其中如果高8位是0，则低八位是子树节点索引。否则，高
	* 8位是当前节点编码长度（最大255），低8位是字符 */
	typedef struct HDecoder {
	unsigned short count;
	unsigned short nodes[HC_MAXNODES][1<<HC_BITSHIFT];
	} HDecoder;

	/* 霍夫曼编码器：code为编码的二进制值，len是编码的比特位 */
	typedef struct HEncoder {
	size_t code[HC_CHARCOUNT];
	size_t len[HC_CHARCOUNT];
	} HEncoder;

	/* 二叉霍夫曼树节点，节点总数是字符集*2，所以left和right分别为
	* CHAR_BIT+1就足够了 */
	typedef struct HNode {
	unsigned weight;
	unsigned code: CHAR_BIT;
	unsigned left: CHAR_BIT+1;
	unsigned right: CHAR_BIT+1;
	unsigned leaf: 1;
	} HNode;

	static void ht_traverse(HEncoder e, HNode nodes, size_t ni, size_t len) {
	HNode *c = &nodes[ni];
	if (c->leaf) {
	assert(len < HC_CODELEN);
	e->len[c->code] = len;
	} else {
	ht_traverse(e, nodes, c->left, len+1);
	ht_traverse(e, nodes, c->right, len+1);
	}
	}

	static void ht_initencoder(HEncoder *e) {
	size_t bl_count[HC_CODELEN+1] = {0};
	size_t next_code[HC_CODELEN+1] = {0};
	size_t i, code = 0;
	for (i = 0; i < HC_CHARCOUNT; ++i)
	++bl_count[e->len[i]];
	bl_count[0] = 0;
	for (i = 1; i <= HC_CODELEN; ++i) {
	code = (code + bl_count[i-1]) << 1;
	next_code[i] = code;
	}
	for (i = 0; i < HC_CHARCOUNT; ++i)
	if (e->len[i]) e->code[i] = next_code[e->len[i]]++;
	}

	static void ht_initdecoder(HDecoder d, const HEncoder e) {
	size_t i;
	memset(d, 0, sizeof(*d));
	d->count = 1;
	for (i = 0; i < HC_CHARCOUNT; ++i) {
	size_t code = e->code[i], len = e->len[i];
	size_t ti = 0;
	while (len) {
	size_t curr = len < HC_BITSHIFT ? len : HC_BITSHIFT;
	size_t old = ti, bit = code >> (len - curr);
	if ((len -= curr) == 0) {
	size_t padding = HC_BITSHIFT - curr;
	short data = (e->len[i]<<CHAR_BIT\|i)&(~(short)0);
	size_t j, count;
	for (j = 0, count = 1<<padding; j < count; ++j)
	d->nodes[ti][(bit<<padding)+j] = data;
	} else if ((ti = d->nodes[ti][bit]) == 0) {
	d->nodes[old][bit] = d->count;
	ti = d->count++;
	}
	assert(ti < d->count);
	code &= (1<<len) - 1;
	}
	}
	}

	static void ht_heapadjust(HNode *h, size_t i, size_t count) {
	HNode curr = h[i];
	size_t root, child;
	for (root = i; (child = root*2+1) < count; root = child) {
	if (child < count-1 && h[child].weight > h[child+1].weight)
	++child;
	if (curr.weight <= h[child].weight)
	break;
	h[root] = h[child];
	}
	h[root] = curr;
	}

	static size_t ht_buildencoder(HEncoder e, HNode nodes, size_t count) {
	size_t i, hsize = count;
	for (i = count/2; i > 0; --i)
	ht_heapadjust(nodes, i, hsize);
	ht_heapadjust(nodes, 0, hsize);
	for (; --hsize > 0; ++count) {
	HNode t = nodes[0];
	nodes[0] = nodes[hsize];
	nodes[hsize] = t;
	ht_heapadjust(nodes, 0, hsize);
	nodes[count] = nodes[0];
	nodes[0].weight += nodes[hsize].weight;
	nodes[0].code = 0;
	nodes[0].left = hsize;
	nodes[0].right = count;
	nodes[0].leaf = 0;
	ht_heapadjust(nodes, 0, hsize);
	}
	memset(e, 0, sizeof(HEncoder));
	ht_traverse(e, nodes, 0, 0);
	ht_initencoder(e);
	return count;
	}

	static size_t ht_buildnode(HNode nodes, FILE fp, size_t *pfilelen) {
	size_t i, count, weight[HC_CHARCOUNT] = {0};
	size_t filelen = 0;
	int ch;
	while ((ch = fgetc(fp)) != EOF)
	++weight[ch & HC_CHARMASK], ++filelen;
	if (pfilelen) *pfilelen = filelen;
	for (i = count = 0; i < HC_CHARCOUNT; ++i) {
	if (weight[i]) {
	nodes[count].weight = weight[i];
	nodes[count].code = i;
	nodes[count].leaf = 1;
	++count;
	}
	}
	for (i = 0; i < HC_CHARCOUNT && count < 2; ++i) {
	if (!weight[i]) {
	nodes[count].weight = weight[i];
	nodes[count].code = i;
	nodes[count].leaf = 1;
	++count;
	}
	}
	return count;
	}

	static void ht_encode(HEncoder e, FILE infp, FILE *outfp) {
	int ch;
	unsigned carry = 0;
	size_t count = 0;
	while ((ch = fgetc(infp)) != EOF) {
	size_t code = e->code[ch & HC_CHARMASK];
	size_t len = e->len[ch & HC_CHARMASK];
	while (len) {
	size_t curr = len < CHAR_BIT ? len : CHAR_BIT;
	if (curr) {
	carry = (carry<<curr)\|(code>>(len - curr));
	count += curr;
	len -= curr;
	}
	/* count数量永远不会超过CHAR_BIT2，所以不需要while /
	assert(count < CHAR_BIT*2);
	if (count >= CHAR_BIT) {
	fputc(carry >> (count-CHAR_BIT), outfp);
	count -= CHAR_BIT;
	carry &= (1 << count) - 1;
	}
	}
	}
	assert(count <= CHAR_BIT);
	fputc(carry << (CHAR_BIT-count), outfp);
	}

	static int ht_decode(HDecoder d, size_t filelen, FILE infp, FILE *outfp) {
	int ch, carry = 0;
	size_t ti = 0, count = 0;
	while (filelen != 0 && (ch = fgetc(infp)) != EOF) {
	carry = carry<<CHAR_BIT\|(ch & HC_CHARMASK);
	count += CHAR_BIT;
	while (filelen != 0 && count >= HC_BITSHIFT) {
	unsigned short code = d->nodes[ti][carry>>(count-HC_BITSHIFT)];
	size_t len = code>>CHAR_BIT;
	if (len == 0) {
	count -= HC_BITSHIFT;
	ti = code;
	} else {
	size_t remain = len % HC_BITSHIFT;
	count -= remain ? remain : HC_BITSHIFT;
	fputc(code&HC_CHARMASK, outfp), --filelen;
	ti = 0;
	}
	carry &= (1 << count) - 1;
	}
	}
	return filelen == 0;
	}

	static int report_error(const char *fn) {
	fprintf(stderr, "%s: %s\n", fn, strerror(errno));
	exit(1);
	return 0; /* 避免警告 */
	}

	static void write_char(FILE *fp, int ch)
	{ if (fputc(ch, fp) != ch) report_error("write"); }

	static void write_size(FILE *fp, size_t sz) {
	size_t i;
	write_char(fp, HC_CODELEN);
	for (i = 0; i < HC_CODELEN; ++i) {
	write_char(fp, sz&HC_CHARMASK);
	sz >>= CHAR_BIT;
	}
	}

	static size_t read_size(FILE *fp) {
	size_t i, sz = 0, count;
	int ch;
	if ((ch = fgetc(fp)) == EOF
	\|\| (count = (size_t)ch) > HC_CODELEN)
	return 0;
	for (i = 0; i < count; ++i) {
	if ((ch = fgetc(fp)) == EOF)
	return 0;
	sz \|= (ch&HC_CHARMASK)<<(i*CHAR_BIT);
	}
	return sz;
	}

	static int encode_file(const char infile, const char outfile) {
	FILE infp, outfp;
	HEncoder e;
	HNode nodes[HC_CHARCOUNT*2];
	size_t i, count, filelen;
	if ((infp = fopen(infile, "rb")) == NULL)
	return report_error(infile);
	if ((outfp = fopen(outfile, "wb")) == NULL)
	return report_error(outfile);
	count = ht_buildnode(nodes, infp, &filelen);
	ht_buildencoder(&e, nodes, count);
	write_size(outfp, filelen);
	for (i = 0; i < HC_CHARCOUNT; ++i)
	write_char(outfp, e.len[i]);
	rewind(infp);
	ht_encode(&e, infp, outfp);
	fclose(infp);
	fclose(outfp);
	return 1;
	}

	static int decode_file(const char infile, const char outfile) {
	FILE infp, outfp;
	HEncoder e;
	HDecoder d;
	size_t i, filelen;
	int ch;
	if ((infp = fopen(infile, "rb")) == NULL)
	return report_error(infile);
	if ((outfp = fopen(outfile, "wb")) == NULL)
	return report_error(outfile);
	if ((filelen = read_size(infp)) == 0)
	return report_error(infile);
	if (filelen > HC_SIZEMAX) {
	fprintf(stderr, "%s: file too big (size: %u)\n",
	infile, (unsigned)filelen);
	exit(1);
	}
	for (i = 0; i < HC_CHARCOUNT; ++i) {
	if ((ch = fgetc(infp)) == EOF)
	return report_error(infile);
	if ((e.len[i] = (size_t)(ch & HC_CHARMASK)) > HC_CODELEN) {
	fprintf(stderr, "%s: table error\n", infile);
	exit(1);
	}
	}
	ht_initencoder(&e);
	ht_initdecoder(&d, &e);
	if (!ht_decode(&d, filelen, infp, outfp)) {
	fprintf(stderr, "%s: decode error\n", outfile);
	exit(1);
	}
	fclose(infp);
	fclose(outfp);
	return 1;
	}

	int main(int argc, char **argv)
	{
	if (argc >= 4 && strcmp(argv[1], "encode") == 0)
	encode_file(argv[2], argv[3]);
	else if (argc >= 4 && strcmp(argv[1], "decode") == 0)
	decode_file(argv[2], argv[3]);
	else {
	fprintf(stderr, "Usage: %s (encode\|decode) infile outfile\n", argv[0]);
	return 0;
	}
	return 0;
	}

	/* cc:run+='decode data.bin out.txt' */