Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active November 30, 2022 04:32
Show Gist options
  • Save rygorous/b434cf2916be5c9573796b5f96671cbe to your computer and use it in GitHub Desktop.
Save rygorous/b434cf2916be5c9573796b5f96671cbe to your computer and use it in GitHub Desktop.
2x interleaved rANS encoder/decoder from BitKnit
#include <stdint.h>
#define BITKNIT_BRANCHLESS_RENORM
// RAD-esque types
typedef size_t UINTa;
typedef uint8_t U8;
typedef uint16_t U16;
typedef uint32_t U32;
// You need to supply:
static U16 read16le_unaligned(U8 const *ptr); // read 16 bits little-endian unaligned on target platform
static void write16le(U8 *dest, U16 val); // write 16 bits little-endian aligned on target platform
static inline U32 highest_set_bit(U32 x); // index of highest set bit (derived from CLZ/BitScanReverse). May assume x!=0.
static UINTa const kBlockSize = 1 << 16; // must be pow2
// Basic entropy coder parameters
static U32 const kProbBits = 15; // must be <=15 for the implementation in this file
static U32 const kMaxProb = 1u << kProbBits;
static U32 const kRansL = 1u << 16;
// --------------------------------------------------------------------------
// Encoder
// NOTE: Max number of symbols per input byte depends on the parse. With the
// current encoding, we have:
//
// - Literal: 1 sym = 1.000 sym/byte (spb)
// - Len-2 match: 1 sym len, 1 sym offs top => 2 syms/2 bytes = 1 spb
// (len-2 is always a rep match)
// - len-3 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 1 sym offs extra => 4 syms/3 bytes = 1.333 spb
// (can't have two offs extra syms [more than 16 offs extra bits] for
// a len-3 match due to offs limit)
// - len-4 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 2 sym offs extra => 5 syms/4 bytes = 1.250 spb
// (len-4 has no offs limit)
//
// Really long matches can have one extra sym for the length, but they're already in a range where they
// end up with way below 1 spb.
//
// So to handle the worst case, we allocate storage for 1.5 * kBlockSize symbols (a bit extra slack past
// 1.333 just in case of later encoding tweaks). This should always be enough, and even for random data,
// you typically only get very slightly above 1.0spb. If this number of symbols is not sufficient, the
// block will end up getting sent uncompressed.
static UINTa const kMaxSymsPerBlock = kBlockSize + (kBlockSize / 2);
struct RansEncSym
{
U16 start;
U16 freq;
};
struct RansEncoder
{
RansEncSym *sym_cur; // current fill position in symbol buffer
RansEncSym *sym_end; // end of symbol buffer
RansEncSym sym_buf[kMaxSymsPerBlock];
};
static void ransenc_init(RansEncoder *r)
{
r->sym_cur = r->sym_buf;
r->sym_end = r->sym_buf + kMaxSymsPerBlock;
}
// Internal: adds symbol to the list
static inline void ransenc_add_sym(RansEncoder *r, U32 start, U32 freq)
{
// If no more free space, bail. (Hitting this mark is treated as an error
// condition in flush).
if (r->sym_cur == r->sym_end)
return;
RansEncSym *s = r->sym_cur++;
s->start = (U16) start;
s->freq = (U16) freq;
}
static void ransenc_encode(RansEncoder *r, U32 start, U32 freq)
{
RR_ASSERT(freq > 0 && freq <= kMaxProb); // freq == kMaxProb is pointless but allowed
RR_ASSERT(start + freq <= kMaxProb);
ransenc_add_sym(r, start, freq);
}
static void ransenc_putbits(RansEncoder *r, U32 val, U32 nbits)
{
RR_ASSERT(nbits < 32);
val &= (1u << nbits) - 1;
if (nbits < 16)
ransenc_add_sym(r, val, nbits | 0x8000);
else
{
ransenc_add_sym(r, val >> 16, (nbits - 16) | 0x8000); // top bits
ransenc_add_sym(r, val & 0xffff, 0xc000); // raw 16 bits for bottom
}
}
static U32 ransenc_renorm(U32 x, U32 xmax, U8 **outp)
{
if (x < xmax) // no renorm necessary
return x;
*outp -= 2;
write16le(*outp, x & 0xffff);
return x >> 16;
}
// This is the low-level func used by flush; use "putbits" to buffer
// up events during normal encoding!
static U32 ransenc_write_bits(U32 x, U32 val, U32 nbits, U8 **outp)
{
RR_ASSERT(nbits <= 16);
RR_ASSERT(val < (1u << nbits));
if (nbits)
x = ransenc_renorm(x, 1u << (32 - nbits), outp);
return (x << nbits) | val;
}
static void *ransenc_attempt_flush(RansEncoder *r, UINTa uncomp_size, UINTa *compressed_size)
{
// if we ran out of symbol buffer space, keep this block uncompressed
if (r->sym_cur == r->sym_end)
{
r->sym_cur = r->sym_buf; // reset sym ptr
return 0;
}
// We write output in reverse, using the same storage as the sym buf.
// Since each symbol generates <=16 bits of output data and RansEncSym
// is larger than 16 bits, this is safe and means we use less memory.
// It also means we don't have to check for buffer overruns in the main
// encode loop (since the output buffer is, by definition, always
// large enough).
U8 *out_end = (U8*)r->sym_cur;
U8 *out_cur = out_end;
U8 *out_beg = (U8*)r->sym_buf;
// actual encoding processes symbols in reverse order
U32 xmax_base = (kRansL >> kProbBits) << 16;
U32 xs = kRansL, xa = kRansL;
for (RansEncSym const *symp = r->sym_cur; symp > r->sym_buf; )
{
// Invariant: every symbol writes <=16 bits which is less than
// the size of a RansEncSym (so sharing space is fine)
RansEncSym s = *--symp;
if (s.freq < 0x8000)
{
// renormalize
U32 x = ransenc_renorm(xs, xmax_base * s.freq, &out_cur);
// encode symbol
x = ((x / s.freq) << kProbBits) + (x % s.freq) + s.start;
xs = xa;
xa = x;
}
else if (s.freq == 0xc000) // raw 16-bit bypass
{
out_cur -= 2;
write16le(out_cur, s.start & 0xffff);
}
else
{
U32 x = ransenc_write_bits(xs, s.start, s.freq & 0x7fff, &out_cur);
xs = xa;
xa = x;
}
}
// reset symbol pointer
r->sym_cur = r->sym_buf;
// if we're out of space or didn't compress the data enough, use an uncompressed block.
static UINTa const kMaxFlushSize = 5*2; // 5=max number of write16s executed
if (out_cur - out_beg < kMaxFlushSize || (out_end - out_cur) + kMaxFlushSize >= uncomp_size)
return 0;
// Weave alternate stream state (xs) into primary (xa) state
// Note we're still writing backwards!
U32 nhighest = highest_set_bit(xs);
RR_ASSERT(nhighest >= 16 && nhighest < 32);
// low bits of xs
out_cur -= 2;
write16le(out_cur, xs & 0xffff);
// high bits of xs + bit count
xa = ransenc_write_bits(xa, (xs >> 16) & ((1u << (nhighest - 16)) - 1), nhighest - 16, &out_cur);
xa = ransenc_write_bits(xa, nhighest - 16, 4, &out_cur);
// xa low state then high state. we want high state last since we use hi!=0 to signal an
// ANS compressed block (since with our normalization conditions, hi==0 can never happen).
out_cur -= 4;
write16le(out_cur + 2, xa & 0xffff);
write16le(out_cur + 0, (xa >> 16));
*compressed_size = out_end - out_cur;
return out_cur;
}
// --------------------------------------------------------------------------
// Decoder
struct RansDecoder
{
U32 x;
U32 xalt;
};
static void ransdec_init_empty(RansDecoder *r)
{
r->x = r->xalt = kRansL;
}
static bool ransdec_in_final_state(RansDecoder const *r)
{
return r->x == kRansL && r->xalt == kRansL;
}
// Renormalize. Internal use only, not part of the interface!
static RADFORCEINLINE void ransdec_renorm(RansDecoder *r, U8 const **p, U32 x)
{
#ifndef BITKNIT_BRANCHLESS_RENORM
if (x < kRansL)
{
x = (x << 16) | read16le_unaligned(*p);
*p += 2;
}
#else
U8 const *ptr = *p;
U8 const *ptr_inc = ptr + 2;
U32 y = (x << 16) | read16le_unaligned(ptr);
ptr = (x < kRansL) ? ptr_inc : ptr;
x = (x < kRansL) ? y : x;
*p = ptr;
#endif
r->x = r->xalt;
r->xalt = x;
}
static inline void ransdec_advance(RansDecoder *r, U8 const **p, U32 start, U32 freq)
{
U32 x = r->x;
U32 xs = (x & (kMaxProb - 1)) - start;
ransdec_renorm(r, p, freq * (x >> kProbBits) + xs);
}
// short: nbits <= 16
static inline U32 ransdec_getbits_short(RansDecoder *r, U8 const **p, U32 nbits)
{
RR_ASSERT(nbits <= 16);
U32 x = r->x;
U32 v = x & ((1u << nbits) - 1);
ransdec_renorm(r, p, x >> nbits);
return v;
}
static inline U32 ransdec_getbits(RansDecoder *r, U8 const **p, U32 nbits)
{
RR_ASSERT(nbits < 32);
// grab high 0-15 bits
U32 val = r->x;
ransdec_renorm(r, p, val >> (nbits & 15));
if (nbits >= 16) // grab low bits if necessary
{
val = (val << 16) | read16le_unaligned(*p);
*p += 2;
}
return val & ((1u << nbits) - 1);
}
static void ransdec_init(RansDecoder *r, U8 const **p)
{
// Read starting state for first stream, high
// word (LE) then low word (LE). weird mixed-endian
// byte order but it makes sense with our tag scheme.
// (see ransenc_attempt_flush).
r->x = read16le_unaligned(*p + 0) << 16;
r->x |= read16le_unaligned(*p + 2);
RR_ASSERT(r->x >= kRansL); // should be normalized: hi word=0 implies uncompressed!
*p += 4;
// number of bits for alternate stream start
U32 na = ransdec_getbits(r, p, 4) + 16;
r->x = r->xalt; // we're not interleaved yet!
U32 xalt = ransdec_getbits(r, p, na) | (1u << na);
RR_ASSERT(xalt >= kRansL); // should be normalized: na >= 16 implies xalt >= kRansL
r->x = r->xalt; // still not interleaved!
r->xalt = xalt;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment