Last active
November 30, 2022 04:32
-
-
Save rygorous/b434cf2916be5c9573796b5f96671cbe to your computer and use it in GitHub Desktop.
2x interleaved rANS encoder/decoder from BitKnit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#define BITKNIT_BRANCHLESS_RENORM | |
// RAD-esque types | |
typedef size_t UINTa; | |
typedef uint8_t U8; | |
typedef uint16_t U16; | |
typedef uint32_t U32; | |
// You need to supply: | |
static U16 read16le_unaligned(U8 const *ptr); // read 16 bits little-endian unaligned on target platform | |
static void write16le(U8 *dest, U16 val); // write 16 bits little-endian aligned on target platform | |
static inline U32 highest_set_bit(U32 x); // index of highest set bit (derived from CLZ/BitScanReverse). May assume x!=0. | |
static UINTa const kBlockSize = 1 << 16; // must be pow2 | |
// Basic entropy coder parameters | |
static U32 const kProbBits = 15; // must be <=15 for the implementation in this file | |
static U32 const kMaxProb = 1u << kProbBits; | |
static U32 const kRansL = 1u << 16; | |
// -------------------------------------------------------------------------- | |
// Encoder | |
// NOTE: Max number of symbols per input byte depends on the parse. With the | |
// current encoding, we have: | |
// | |
// - Literal: 1 sym = 1.000 sym/byte (spb) | |
// - Len-2 match: 1 sym len, 1 sym offs top => 2 syms/2 bytes = 1 spb | |
// (len-2 is always a rep match) | |
// - len-3 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 1 sym offs extra => 4 syms/3 bytes = 1.333 spb | |
// (can't have two offs extra syms [more than 16 offs extra bits] for | |
// a len-3 match due to offs limit) | |
// - len-4 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 2 sym offs extra => 5 syms/4 bytes = 1.250 spb | |
// (len-4 has no offs limit) | |
// | |
// Really long matches can have one extra sym for the length, but they're already in a range where they | |
// end up with way below 1 spb. | |
// | |
// So to handle the worst case, we allocate storage for 1.5 * kBlockSize symbols (a bit extra slack past | |
// 1.333 just in case of later encoding tweaks). This should always be enough, and even for random data, | |
// you typically only get very slightly above 1.0spb. If this number of symbols is not sufficient, the | |
// block will end up getting sent uncompressed. | |
static UINTa const kMaxSymsPerBlock = kBlockSize + (kBlockSize / 2); | |
struct RansEncSym | |
{ | |
U16 start; | |
U16 freq; | |
}; | |
struct RansEncoder | |
{ | |
RansEncSym *sym_cur; // current fill position in symbol buffer | |
RansEncSym *sym_end; // end of symbol buffer | |
RansEncSym sym_buf[kMaxSymsPerBlock]; | |
}; | |
static void ransenc_init(RansEncoder *r) | |
{ | |
r->sym_cur = r->sym_buf; | |
r->sym_end = r->sym_buf + kMaxSymsPerBlock; | |
} | |
// Internal: adds symbol to the list | |
static inline void ransenc_add_sym(RansEncoder *r, U32 start, U32 freq) | |
{ | |
// If no more free space, bail. (Hitting this mark is treated as an error | |
// condition in flush). | |
if (r->sym_cur == r->sym_end) | |
return; | |
RansEncSym *s = r->sym_cur++; | |
s->start = (U16) start; | |
s->freq = (U16) freq; | |
} | |
static void ransenc_encode(RansEncoder *r, U32 start, U32 freq) | |
{ | |
RR_ASSERT(freq > 0 && freq <= kMaxProb); // freq == kMaxProb is pointless but allowed | |
RR_ASSERT(start + freq <= kMaxProb); | |
ransenc_add_sym(r, start, freq); | |
} | |
static void ransenc_putbits(RansEncoder *r, U32 val, U32 nbits) | |
{ | |
RR_ASSERT(nbits < 32); | |
val &= (1u << nbits) - 1; | |
if (nbits < 16) | |
ransenc_add_sym(r, val, nbits | 0x8000); | |
else | |
{ | |
ransenc_add_sym(r, val >> 16, (nbits - 16) | 0x8000); // top bits | |
ransenc_add_sym(r, val & 0xffff, 0xc000); // raw 16 bits for bottom | |
} | |
} | |
static U32 ransenc_renorm(U32 x, U32 xmax, U8 **outp) | |
{ | |
if (x < xmax) // no renorm necessary | |
return x; | |
*outp -= 2; | |
write16le(*outp, x & 0xffff); | |
return x >> 16; | |
} | |
// This is the low-level func used by flush; use "putbits" to buffer | |
// up events during normal encoding! | |
static U32 ransenc_write_bits(U32 x, U32 val, U32 nbits, U8 **outp) | |
{ | |
RR_ASSERT(nbits <= 16); | |
RR_ASSERT(val < (1u << nbits)); | |
if (nbits) | |
x = ransenc_renorm(x, 1u << (32 - nbits), outp); | |
return (x << nbits) | val; | |
} | |
static void *ransenc_attempt_flush(RansEncoder *r, UINTa uncomp_size, UINTa *compressed_size) | |
{ | |
// if we ran out of symbol buffer space, keep this block uncompressed | |
if (r->sym_cur == r->sym_end) | |
{ | |
r->sym_cur = r->sym_buf; // reset sym ptr | |
return 0; | |
} | |
// We write output in reverse, using the same storage as the sym buf. | |
// Since each symbol generates <=16 bits of output data and RansEncSym | |
// is larger than 16 bits, this is safe and means we use less memory. | |
// It also means we don't have to check for buffer overruns in the main | |
// encode loop (since the output buffer is, by definition, always | |
// large enough). | |
U8 *out_end = (U8*)r->sym_cur; | |
U8 *out_cur = out_end; | |
U8 *out_beg = (U8*)r->sym_buf; | |
// actual encoding processes symbols in reverse order | |
U32 xmax_base = (kRansL >> kProbBits) << 16; | |
U32 xs = kRansL, xa = kRansL; | |
for (RansEncSym const *symp = r->sym_cur; symp > r->sym_buf; ) | |
{ | |
// Invariant: every symbol writes <=16 bits which is less than | |
// the size of a RansEncSym (so sharing space is fine) | |
RansEncSym s = *--symp; | |
if (s.freq < 0x8000) | |
{ | |
// renormalize | |
U32 x = ransenc_renorm(xs, xmax_base * s.freq, &out_cur); | |
// encode symbol | |
x = ((x / s.freq) << kProbBits) + (x % s.freq) + s.start; | |
xs = xa; | |
xa = x; | |
} | |
else if (s.freq == 0xc000) // raw 16-bit bypass | |
{ | |
out_cur -= 2; | |
write16le(out_cur, s.start & 0xffff); | |
} | |
else | |
{ | |
U32 x = ransenc_write_bits(xs, s.start, s.freq & 0x7fff, &out_cur); | |
xs = xa; | |
xa = x; | |
} | |
} | |
// reset symbol pointer | |
r->sym_cur = r->sym_buf; | |
// if we're out of space or didn't compress the data enough, use an uncompressed block. | |
static UINTa const kMaxFlushSize = 5*2; // 5=max number of write16s executed | |
if (out_cur - out_beg < kMaxFlushSize || (out_end - out_cur) + kMaxFlushSize >= uncomp_size) | |
return 0; | |
// Weave alternate stream state (xs) into primary (xa) state | |
// Note we're still writing backwards! | |
U32 nhighest = highest_set_bit(xs); | |
RR_ASSERT(nhighest >= 16 && nhighest < 32); | |
// low bits of xs | |
out_cur -= 2; | |
write16le(out_cur, xs & 0xffff); | |
// high bits of xs + bit count | |
xa = ransenc_write_bits(xa, (xs >> 16) & ((1u << (nhighest - 16)) - 1), nhighest - 16, &out_cur); | |
xa = ransenc_write_bits(xa, nhighest - 16, 4, &out_cur); | |
// xa low state then high state. we want high state last since we use hi!=0 to signal an | |
// ANS compressed block (since with our normalization conditions, hi==0 can never happen). | |
out_cur -= 4; | |
write16le(out_cur + 2, xa & 0xffff); | |
write16le(out_cur + 0, (xa >> 16)); | |
*compressed_size = out_end - out_cur; | |
return out_cur; | |
} | |
// -------------------------------------------------------------------------- | |
// Decoder | |
struct RansDecoder | |
{ | |
U32 x; | |
U32 xalt; | |
}; | |
static void ransdec_init_empty(RansDecoder *r) | |
{ | |
r->x = r->xalt = kRansL; | |
} | |
static bool ransdec_in_final_state(RansDecoder const *r) | |
{ | |
return r->x == kRansL && r->xalt == kRansL; | |
} | |
// Renormalize. Internal use only, not part of the interface! | |
static RADFORCEINLINE void ransdec_renorm(RansDecoder *r, U8 const **p, U32 x) | |
{ | |
#ifndef BITKNIT_BRANCHLESS_RENORM | |
if (x < kRansL) | |
{ | |
x = (x << 16) | read16le_unaligned(*p); | |
*p += 2; | |
} | |
#else | |
U8 const *ptr = *p; | |
U8 const *ptr_inc = ptr + 2; | |
U32 y = (x << 16) | read16le_unaligned(ptr); | |
ptr = (x < kRansL) ? ptr_inc : ptr; | |
x = (x < kRansL) ? y : x; | |
*p = ptr; | |
#endif | |
r->x = r->xalt; | |
r->xalt = x; | |
} | |
static inline void ransdec_advance(RansDecoder *r, U8 const **p, U32 start, U32 freq) | |
{ | |
U32 x = r->x; | |
U32 xs = (x & (kMaxProb - 1)) - start; | |
ransdec_renorm(r, p, freq * (x >> kProbBits) + xs); | |
} | |
// short: nbits <= 16 | |
static inline U32 ransdec_getbits_short(RansDecoder *r, U8 const **p, U32 nbits) | |
{ | |
RR_ASSERT(nbits <= 16); | |
U32 x = r->x; | |
U32 v = x & ((1u << nbits) - 1); | |
ransdec_renorm(r, p, x >> nbits); | |
return v; | |
} | |
static inline U32 ransdec_getbits(RansDecoder *r, U8 const **p, U32 nbits) | |
{ | |
RR_ASSERT(nbits < 32); | |
// grab high 0-15 bits | |
U32 val = r->x; | |
ransdec_renorm(r, p, val >> (nbits & 15)); | |
if (nbits >= 16) // grab low bits if necessary | |
{ | |
val = (val << 16) | read16le_unaligned(*p); | |
*p += 2; | |
} | |
return val & ((1u << nbits) - 1); | |
} | |
static void ransdec_init(RansDecoder *r, U8 const **p) | |
{ | |
// Read starting state for first stream, high | |
// word (LE) then low word (LE). weird mixed-endian | |
// byte order but it makes sense with our tag scheme. | |
// (see ransenc_attempt_flush). | |
r->x = read16le_unaligned(*p + 0) << 16; | |
r->x |= read16le_unaligned(*p + 2); | |
RR_ASSERT(r->x >= kRansL); // should be normalized: hi word=0 implies uncompressed! | |
*p += 4; | |
// number of bits for alternate stream start | |
U32 na = ransdec_getbits(r, p, 4) + 16; | |
r->x = r->xalt; // we're not interleaved yet! | |
U32 xalt = ransdec_getbits(r, p, na) | (1u << na); | |
RR_ASSERT(xalt >= kRansL); // should be normalized: na >= 16 implies xalt >= kRansL | |
r->x = r->xalt; // still not interleaved! | |
r->xalt = xalt; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment