Last active
November 30, 2019 11:56
-
-
Save starwing/ff3e7078c3c07b6f7a0c9d1b634a64d8 to your computer and use it in GitHub Desktop.
Arithmetic Coding in C: https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _CRT_SECURE_NO_WARNINGS | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <limits.h> | |
#define SHIFT (sizeof(unsigned)*CHAR_BIT) | |
#define RANGE_MAX (~0u) | |
#define CHAR_MASK ((1u<<CHAR_BIT)-1) | |
enum ExtraChar { ESC = 1<<CHAR_BIT, EOS, CHAR_COUNT }; | |
static unsigned mul_div(unsigned a, unsigned b, unsigned c) { | |
unsigned long long ret = a; | |
ret *= b, ret /= c; | |
return (unsigned)ret; | |
} | |
typedef struct ACBuffer { | |
char buff[BUFSIZ]; | |
unsigned curr, count; | |
const char *fname; | |
FILE *fp; | |
} ACBuffer; | |
static void ac_freebuffer(ACBuffer *b) | |
{ if (b->fp) fclose(b->fp), b->fp = NULL; } | |
static void ac_initbuffer(ACBuffer *b, const char *fname, const char *mode) { | |
memset(b, 0, sizeof(*b)); | |
if ((b->fp = fopen(b->fname = fname, mode)) == NULL) | |
perror(fname), exit(1); | |
} | |
static void ac_putchar(ACBuffer *b, int ch) { | |
if (ch != EOS) b->buff[b->curr++] = ch; | |
if (ch == EOS || b->curr >= BUFSIZ) { | |
size_t count = fwrite(b->buff, 1, b->curr, b->fp); | |
if (count != b->curr) perror(b->fname), exit(1); | |
b->curr = 0; | |
} | |
} | |
static int ac_getchar(ACBuffer *b) { | |
if (b->curr == b->count) { | |
size_t count = fread(b->buff, 1, BUFSIZ, b->fp); | |
if (count != BUFSIZ && ferror(b->fp)) perror(b->fname), exit(1); | |
b->curr = 0, b->count = (unsigned)count; | |
if (b->count == 0) return EOS; | |
} | |
return b->buff[b->curr++] & CHAR_MASK; | |
} | |
typedef struct ACEncoder { | |
unsigned freq[CHAR_COUNT], sums[CHAR_COUNT], sum; | |
unsigned lower, upper; | |
unsigned carry, current; | |
size_t cc; | |
} ACEncoder; | |
static void ac_addbit(ACEncoder *e, int pos, unsigned v) { | |
int i; | |
for (i = pos+1; i <= CHAR_COUNT; i += i&-i) | |
e->sums[i-1] += v; | |
e->freq[pos] += v, e->sum += v; | |
} | |
static unsigned ac_fetchbit(ACEncoder *e, int pos) { | |
int i, sum = 0; | |
for (i = pos+1; i > 0; i -= i&-i) | |
sum += e->sums[i-1]; | |
return (unsigned)sum; | |
} | |
static unsigned ac_log2(unsigned v) { | |
unsigned r, shift; | |
r = (v > 0xFFFF) << 4; v >>= r; | |
shift = (v > 0xFF ) << 3; v >>= shift; r |= shift; | |
shift = (v > 0xF ) << 2; v >>= shift; r |= shift; | |
shift = (v > 0x3 ) << 1; v >>= shift; r |= shift; | |
return (r |= (v >> 1)) + 1; | |
} | |
static void ac_init(ACEncoder *e) { | |
memset(e, 0, sizeof(*e)); | |
ac_addbit(e, ESC, 1); | |
e->upper = RANGE_MAX; | |
} | |
static size_t ac_update(ACEncoder *e, unsigned prev, unsigned curr, unsigned sum) { | |
unsigned delta = e->upper - e->lower; | |
e->lower += mul_div(delta, prev, sum); | |
e->upper = mul_div(delta, curr, sum) + e->lower; | |
return SHIFT - ac_log2(e->lower^e->upper); | |
} | |
static size_t ac_underflow(ACEncoder *e, size_t shift) { | |
size_t pending = 0; | |
e->lower <<= shift, e->upper <<= shift, e->upper |= (1<<shift)-1; | |
if ((e->lower & ~e->upper & 1<<(SHIFT-2)) != 0) { | |
unsigned mask = (~e->lower|e->upper) << 1; | |
pending = SHIFT - ac_log2(mask); | |
e->lower = (e->lower << pending) ^ 1<<(SHIFT-1); | |
e->upper = (e->upper << pending) | 1<<(SHIFT-1) | (1<<pending)-1; | |
} | |
return pending; | |
} | |
static void ac_encode_flush(ACEncoder *e, ACBuffer *out, | |
unsigned bin, size_t shift, size_t pending) { | |
if (pending) { | |
ac_encode_flush(e, out, ~(~(bin>>(SHIFT-1))+1) | |
^(1u<<(SHIFT-1)), pending+1, 0); | |
bin <<= 1, shift -= 1; | |
} | |
while (shift > 0) { | |
size_t cc = e->cc+shift > SHIFT ? SHIFT - e->cc : shift; | |
e->carry = (e->carry << cc) | (bin >> (SHIFT-cc)); | |
shift -= cc, e->cc += cc; | |
while (e->cc >= CHAR_BIT) | |
ac_putchar(out, (e->carry >> (e->cc -= CHAR_BIT)) & CHAR_MASK); | |
} | |
} | |
static size_t ac_encode_char(ACEncoder *e, ACBuffer *out, int ch, size_t p) { | |
size_t shift; | |
unsigned prev = ch, curr = 1, sum = CHAR_COUNT; | |
if (e->freq[ch]) | |
prev = ac_fetchbit(e, ch-1), curr = e->freq[ch], sum = e->sum; | |
else | |
p = ac_encode_char(e, out, ESC, p); | |
if ((shift = ac_update(e, prev, curr, sum)) > 0) | |
ac_encode_flush(e, out, e->lower, shift, p), p = 0; | |
ac_addbit(e, ch, 1); | |
return p + ac_underflow(e, shift); | |
} | |
static void ac_encode(ACEncoder *e, ACBuffer *in, ACBuffer *out) { | |
size_t pending = 0; | |
int ch; | |
while ((ch = ac_getchar(in)) != EOS) | |
pending = ac_encode_char(e, out, ch, pending); | |
pending = ac_encode_char(e, out, EOS, pending); | |
ac_encode_flush(e, out, (1+(e->lower>>(SHIFT-2)))<<(SHIFT-2), 2, pending); | |
if (e->cc != 0) ac_encode_flush(e, out, 0, CHAR_BIT-e->cc, 0); | |
ac_putchar(out, EOS); | |
} | |
static unsigned ac_decode_fetch(ACEncoder *e, ACBuffer *in, size_t shift) { | |
unsigned current = 0; | |
while (shift > 0) { | |
size_t cc = shift > CHAR_BIT ? CHAR_BIT : shift; | |
if (e->cc < cc) { | |
int ch = ac_getchar(in); | |
if (ch == EOS) ch = 0; | |
e->carry = (e->carry << CHAR_BIT) | ch; | |
e->cc += CHAR_BIT; | |
} | |
shift -= cc, e->cc -= cc; | |
current = (current << cc) | (e->carry >> e->cc); | |
e->carry &= (1<<e->cc) - 1; | |
} | |
return current; | |
} | |
static void ac_decode_fill(ACEncoder *e, ACBuffer *in, | |
unsigned prev, unsigned curr, unsigned sum) { | |
size_t shift = ac_update(e, prev, curr, sum); | |
e->current = (e->current << shift) | ac_decode_fetch(e, in, shift); | |
if ((shift = ac_underflow(e, shift)) > 0) | |
e->current = ((e->current << shift)^(1<<(SHIFT-1))) | |
| ac_decode_fetch(e, in, shift); | |
if (!(e->lower <= e->current && e->current < e->upper)) | |
fprintf(stderr, "decode fail"), exit(1); | |
} | |
static void ac_decode(ACEncoder *e, ACBuffer *in, ACBuffer *out) { | |
int ch = 0; | |
e->current = ac_decode_fetch(e, in, SHIFT); | |
while (ch != EOS) { | |
unsigned dx = mul_div(e->current-e->lower+1, e->sum, e->upper-e->lower); | |
unsigned i, x = 0, cx, p = 0, cp; | |
for (i = CHAR_BIT+1; i > 0; --i) | |
if ((cp = p+(1<<(i-1))) <= CHAR_COUNT | |
&& (cx = x+e->sums[cp-1]) <= dx) | |
p = cp, x = cx; | |
ac_decode_fill(e, in, ac_fetchbit(e, p-1), e->freq[p], e->sum); | |
if (p == ESC) { | |
ac_addbit(e, p, 1); | |
p = mul_div(e->current-e->lower+1, CHAR_COUNT, e->upper-e->lower); | |
ac_decode_fill(e, in, p, 1, CHAR_COUNT); | |
} | |
ac_putchar(out, ch = (int)p); | |
ac_addbit(e, ch, 1); | |
} | |
} | |
static void encode_file(const char *infile, const char *outfile) { | |
ACBuffer in, out; | |
ACEncoder e; | |
ac_initbuffer(&in, infile, "rb"); | |
ac_initbuffer(&out, outfile, "wb"); | |
ac_init(&e); | |
ac_encode(&e, &in, &out); | |
ac_freebuffer(&in); | |
ac_freebuffer(&out); | |
} | |
static void decode_file(const char *infile, const char *outfile) { | |
ACBuffer in, out; | |
ACEncoder e; | |
ac_initbuffer(&in, infile, "rb"); | |
ac_initbuffer(&out, outfile, "wb"); | |
ac_init(&e); | |
ac_decode(&e, &in, &out); | |
ac_freebuffer(&in); | |
ac_freebuffer(&out); | |
} | |
int main(int argc, char **argv) { | |
if (argc >= 4 && strcmp(argv[1], "encode") == 0) | |
encode_file(argv[2], argv[3]); | |
else if (argc >= 4 && strcmp(argv[1], "decode") == 0) | |
decode_file(argv[2], argv[3]); | |
else | |
fprintf(stderr, "Usage: %s (encode|decode) infile outfile\n", argv[0]); | |
return 0; | |
} | |
/* cc: run+='encode % out.bin' */ | |
/* xcc: run+='decode out.bin out.txt' */ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment