rygorous · November 30, 2022 04:32
diff --git a/gistfile1.cpp b/gistfile1.cpp
 #include <stdint.h>

 #define BITKNIT_BRANCHLESS_RENORM

 // RAD-esque types
 typedef size_t UINTa;
 typedef uint8_t U8;
 typedef uint16_t U16;
 typedef uint32_t U32;

 // You need to supply:
 static U16 read16le_unaligned(U8 const *ptr); // read 16 bits little-endian unaligned on target platform
 static void write16le(U8 *dest, U16 val);     // write 16 bits little-endian aligned on target platform
 static inline U32 highest_set_bit(U32 x);     // index of highest set bit (derived from CLZ/BitScanReverse). May assume x!=0.

 static UINTa const kBlockSize = 1 << 16; // must be pow2

 // Basic entropy coder parameters
 static U32 const kProbBits = 15; // must be <=15 for the implementation in this file
 static U32 const kMaxProb = 1u << kProbBits;
 static U32 const kRansL = 1u << 16;

 // --------------------------------------------------------------------------
 // Encoder

 // NOTE: Max number of symbols per input byte depends on the parse. With the
 // current encoding, we have:
 //
 // - Literal:      1 sym = 1.000 sym/byte (spb)
 // - Len-2 match:  1 sym len, 1 sym offs top => 2 syms/2 bytes = 1 spb
 //                 (len-2 is always a rep match)
 // - len-3 match:  1 sym len, 1 sym offs top, 1 sym offs bot, 1 sym offs extra => 4 syms/3 bytes = 1.333 spb
 //                 (can't have two offs extra syms [more than 16 offs extra bits] for
 //                 a len-3 match due to offs limit)
 // - len-4 match:  1 sym len, 1 sym offs top, 1 sym offs bot, 2 sym offs extra => 5 syms/4 bytes = 1.250 spb
 //                 (len-4 has no offs limit)
 //
 // Really long matches can have one extra sym for the length, but they're already in a range where they
 // end up with way below 1 spb.
 //
 // So to handle the worst case, we allocate storage for 1.5 * kBlockSize symbols (a bit extra slack past
 // 1.333 just in case of later encoding tweaks). This should always be enough, and even for random data,
 // you typically only get very slightly above 1.0spb. If this number of symbols is not sufficient, the
 // block will end up getting sent uncompressed.
 static UINTa const kMaxSymsPerBlock = kBlockSize + (kBlockSize / 2);

 struct RansEncSym
 {
    U16 start;
    U16 freq;
 };

 struct RansEncoder
 {
    RansEncSym *sym_cur;    // current fill position in symbol buffer
    RansEncSym *sym_end;    // end of symbol buffer
    RansEncSym sym_buf[kMaxSymsPerBlock];
 };

 static void ransenc_init(RansEncoder *r)
 {
    r->sym_cur = r->sym_buf;
    r->sym_end = r->sym_buf + kMaxSymsPerBlock;
 }

 // Internal: adds symbol to the list
 static inline void ransenc_add_sym(RansEncoder *r, U32 start, U32 freq)
 {
    // If no more free space, bail. (Hitting this mark is treated as an error
    // condition in flush).
    if (r->sym_cur == r->sym_end)
        return;

    RansEncSym *s = r->sym_cur++;
    s->start = (U16) start;
    s->freq = (U16) freq;
 }

 static void ransenc_encode(RansEncoder *r, U32 start, U32 freq)
 {
    RR_ASSERT(freq > 0 && freq <= kMaxProb); // freq == kMaxProb is pointless but allowed
    RR_ASSERT(start + freq <= kMaxProb);
    ransenc_add_sym(r, start, freq);
 }

 static void ransenc_putbits(RansEncoder *r, U32 val, U32 nbits)
 {
    RR_ASSERT(nbits < 32);
    val &= (1u << nbits) - 1;

    if (nbits < 16)
        ransenc_add_sym(r, val, nbits | 0x8000);
    else
    {
        ransenc_add_sym(r, val >> 16, (nbits - 16) | 0x8000); // top bits
        ransenc_add_sym(r, val & 0xffff, 0xc000); // raw 16 bits for bottom
    }
 }

 static U32 ransenc_renorm(U32 x, U32 xmax, U8 **outp)
 {
    if (x < xmax) // no renorm necessary
        return x;

    *outp -= 2;
    write16le(*outp, x & 0xffff);
    return x >> 16;
 }

 // This is the low-level func used by flush; use "putbits" to buffer
 // up events during normal encoding!
 static U32 ransenc_write_bits(U32 x, U32 val, U32 nbits, U8 **outp)
 {
    RR_ASSERT(nbits <= 16);
    RR_ASSERT(val < (1u << nbits));
    if (nbits)
        x = ransenc_renorm(x, 1u << (32 - nbits), outp);
    return (x << nbits) | val;
 }

 static void *ransenc_attempt_flush(RansEncoder *r, UINTa uncomp_size, UINTa *compressed_size)
 {
    // if we ran out of symbol buffer space, keep this block uncompressed
    if (r->sym_cur == r->sym_end)
    {
        r->sym_cur = r->sym_buf; // reset sym ptr
        return 0;
    }

    // We write output in reverse, using the same storage as the sym buf.
    // Since each symbol generates <=16 bits of output data and RansEncSym
    // is larger than 16 bits, this is safe and means we use less memory.
    // It also means we don't have to check for buffer overruns in the main
    // encode loop (since the output buffer is, by definition, always
    // large enough).
    U8 *out_end = (U8*)r->sym_cur;
    U8 *out_cur = out_end;
    U8 *out_beg = (U8*)r->sym_buf;

    // actual encoding processes symbols in reverse order
    U32 xmax_base = (kRansL >> kProbBits) << 16;
    U32 xs = kRansL, xa = kRansL;
    for (RansEncSym const *symp = r->sym_cur; symp > r->sym_buf; )
    {
        // Invariant: every symbol writes <=16 bits which is less than
        // the size of a RansEncSym (so sharing space is fine)
        RansEncSym s = *--symp;

        if (s.freq < 0x8000)
        {
            // renormalize
            U32 x = ransenc_renorm(xs, xmax_base * s.freq, &out_cur);

            // encode symbol
            x = ((x / s.freq) << kProbBits) + (x % s.freq) + s.start;
            xs = xa;
            xa = x;
        }
        else if (s.freq == 0xc000) // raw 16-bit bypass
        {
            out_cur -= 2;
            write16le(out_cur, s.start & 0xffff);
        }
        else
        {
            U32 x = ransenc_write_bits(xs, s.start, s.freq & 0x7fff, &out_cur);
            xs = xa;
            xa = x;
        }
    }

    // reset symbol pointer
    r->sym_cur = r->sym_buf;

    // if we're out of space or didn't compress the data enough, use an uncompressed block.
    static UINTa const kMaxFlushSize = 5*2; // 5=max number of write16s executed
    if (out_cur - out_beg < kMaxFlushSize || (out_end - out_cur) + kMaxFlushSize >= uncomp_size)
        return 0;

    // Weave alternate stream state (xs) into primary (xa) state
    // Note we're still writing backwards!
    U32 nhighest = highest_set_bit(xs);
    RR_ASSERT(nhighest >= 16 && nhighest < 32);

    // low bits of xs
    out_cur -= 2;
    write16le(out_cur, xs & 0xffff);
    
    // high bits of xs + bit count
    xa = ransenc_write_bits(xa, (xs >> 16) & ((1u << (nhighest - 16)) - 1), nhighest - 16, &out_cur);
    xa = ransenc_write_bits(xa, nhighest - 16, 4, &out_cur);

    // xa low state then high state. we want high state last since we use hi!=0 to signal an
    // ANS compressed block (since with our normalization conditions, hi==0 can never happen).
    out_cur -= 4;
    write16le(out_cur + 2, xa & 0xffff);
    write16le(out_cur + 0, (xa >> 16));

    *compressed_size = out_end - out_cur;
    return out_cur;
 }

 // --------------------------------------------------------------------------
 // Decoder

 struct RansDecoder
 {
    U32 x;
    U32 xalt;
 };

 static void ransdec_init_empty(RansDecoder *r)
 {
    r->x = r->xalt = kRansL;
 }

 static bool ransdec_in_final_state(RansDecoder const *r)
 {
    return r->x == kRansL && r->xalt == kRansL;
 }

 // Renormalize. Internal use only, not part of the interface!
 static RADFORCEINLINE void ransdec_renorm(RansDecoder *r, U8 const **p, U32 x)
 {
 #ifndef BITKNIT_BRANCHLESS_RENORM
    if (x < kRansL)
    {
        x = (x << 16) | read16le_unaligned(*p);
        *p += 2;
    }
 #else
    U8 const *ptr = *p;
    U8 const *ptr_inc = ptr + 2;
    U32 y = (x << 16) | read16le_unaligned(ptr);
    ptr = (x < kRansL) ? ptr_inc : ptr;
    x = (x < kRansL) ? y : x;
    *p = ptr;
 #endif

    r->x = r->xalt;
    r->xalt = x;
 }

 static inline void ransdec_advance(RansDecoder *r, U8 const **p, U32 start, U32 freq)
 {
    U32 x = r->x;
    U32 xs = (x & (kMaxProb - 1)) - start;
    ransdec_renorm(r, p, freq * (x >> kProbBits) + xs);
 }

 // short: nbits <= 16
 static inline U32 ransdec_getbits_short(RansDecoder *r, U8 const **p, U32 nbits)
 {
    RR_ASSERT(nbits <= 16);
    U32 x = r->x;
    U32 v = x & ((1u << nbits) - 1);
    ransdec_renorm(r, p, x >> nbits);
    return v;
 }

 static inline U32 ransdec_getbits(RansDecoder *r, U8 const **p, U32 nbits)
 {
    RR_ASSERT(nbits < 32);

    // grab high 0-15 bits
    U32 val = r->x;
    ransdec_renorm(r, p, val >> (nbits & 15));
    if (nbits >= 16) // grab low bits if necessary
    {
        val = (val << 16) | read16le_unaligned(*p);
        *p += 2;
    }
    return val & ((1u << nbits) - 1);
 }

 static void ransdec_init(RansDecoder *r, U8 const **p)
 {
    // Read starting state for first stream, high
    // word (LE) then low word (LE). weird mixed-endian
    // byte order but it makes sense with our tag scheme.
    // (see ransenc_attempt_flush).
    r->x  = read16le_unaligned(*p + 0) << 16;
    r->x |= read16le_unaligned(*p + 2);
    RR_ASSERT(r->x >= kRansL); // should be normalized: hi word=0 implies uncompressed!
    *p += 4;

    // number of bits for alternate stream start
    U32 na = ransdec_getbits(r, p, 4) + 16;
    r->x = r->xalt; // we're not interleaved yet!

    U32 xalt = ransdec_getbits(r, p, na) | (1u << na);
    RR_ASSERT(xalt >= kRansL); // should be normalized: na >= 16 implies xalt >= kRansL
    r->x = r->xalt; // still not interleaved!
    r->xalt = xalt;
 }
	#include <stdint.h>

	#define BITKNIT_BRANCHLESS_RENORM

	// RAD-esque types
	typedef size_t UINTa;
	typedef uint8_t U8;
	typedef uint16_t U16;
	typedef uint32_t U32;

	// You need to supply:
	static U16 read16le_unaligned(U8 const *ptr); // read 16 bits little-endian unaligned on target platform
	static void write16le(U8 *dest, U16 val); // write 16 bits little-endian aligned on target platform
	static inline U32 highest_set_bit(U32 x); // index of highest set bit (derived from CLZ/BitScanReverse). May assume x!=0.

	static UINTa const kBlockSize = 1 << 16; // must be pow2

	// Basic entropy coder parameters
	static U32 const kProbBits = 15; // must be <=15 for the implementation in this file
	static U32 const kMaxProb = 1u << kProbBits;
	static U32 const kRansL = 1u << 16;

	// --------------------------------------------------------------------------
	// Encoder

	// NOTE: Max number of symbols per input byte depends on the parse. With the
	// current encoding, we have:
	//
	// - Literal: 1 sym = 1.000 sym/byte (spb)
	// - Len-2 match: 1 sym len, 1 sym offs top => 2 syms/2 bytes = 1 spb
	// (len-2 is always a rep match)
	// - len-3 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 1 sym offs extra => 4 syms/3 bytes = 1.333 spb
	// (can't have two offs extra syms [more than 16 offs extra bits] for
	// a len-3 match due to offs limit)
	// - len-4 match: 1 sym len, 1 sym offs top, 1 sym offs bot, 2 sym offs extra => 5 syms/4 bytes = 1.250 spb
	// (len-4 has no offs limit)
	//
	// Really long matches can have one extra sym for the length, but they're already in a range where they
	// end up with way below 1 spb.
	//
	// So to handle the worst case, we allocate storage for 1.5 * kBlockSize symbols (a bit extra slack past
	// 1.333 just in case of later encoding tweaks). This should always be enough, and even for random data,
	// you typically only get very slightly above 1.0spb. If this number of symbols is not sufficient, the
	// block will end up getting sent uncompressed.
	static UINTa const kMaxSymsPerBlock = kBlockSize + (kBlockSize / 2);

	struct RansEncSym
	{
	U16 start;
	U16 freq;
	};

	struct RansEncoder
	{
	RansEncSym *sym_cur; // current fill position in symbol buffer
	RansEncSym *sym_end; // end of symbol buffer
	RansEncSym sym_buf[kMaxSymsPerBlock];
	};

	static void ransenc_init(RansEncoder *r)
	{
	r->sym_cur = r->sym_buf;
	r->sym_end = r->sym_buf + kMaxSymsPerBlock;
	}

	// Internal: adds symbol to the list
	static inline void ransenc_add_sym(RansEncoder *r, U32 start, U32 freq)
	{
	// If no more free space, bail. (Hitting this mark is treated as an error
	// condition in flush).
	if (r->sym_cur == r->sym_end)
	return;

	RansEncSym *s = r->sym_cur++;
	s->start = (U16) start;
	s->freq = (U16) freq;
	}

	static void ransenc_encode(RansEncoder *r, U32 start, U32 freq)
	{
	RR_ASSERT(freq > 0 && freq <= kMaxProb); // freq == kMaxProb is pointless but allowed
	RR_ASSERT(start + freq <= kMaxProb);
	ransenc_add_sym(r, start, freq);
	}

	static void ransenc_putbits(RansEncoder *r, U32 val, U32 nbits)
	{
	RR_ASSERT(nbits < 32);
	val &= (1u << nbits) - 1;

	if (nbits < 16)
	ransenc_add_sym(r, val, nbits \| 0x8000);
	else
	{
	ransenc_add_sym(r, val >> 16, (nbits - 16) \| 0x8000); // top bits
	ransenc_add_sym(r, val & 0xffff, 0xc000); // raw 16 bits for bottom
	}
	}

	static U32 ransenc_renorm(U32 x, U32 xmax, U8 **outp)
	{
	if (x < xmax) // no renorm necessary
	return x;

	*outp -= 2;
	write16le(*outp, x & 0xffff);
	return x >> 16;
	}

	// This is the low-level func used by flush; use "putbits" to buffer
	// up events during normal encoding!
	static U32 ransenc_write_bits(U32 x, U32 val, U32 nbits, U8 **outp)
	{
	RR_ASSERT(nbits <= 16);
	RR_ASSERT(val < (1u << nbits));
	if (nbits)
	x = ransenc_renorm(x, 1u << (32 - nbits), outp);
	return (x << nbits) \| val;
	}

	static void ransenc_attempt_flush(RansEncoder r, UINTa uncomp_size, UINTa *compressed_size)
	{
	// if we ran out of symbol buffer space, keep this block uncompressed
	if (r->sym_cur == r->sym_end)
	{
	r->sym_cur = r->sym_buf; // reset sym ptr
	return 0;
	}

	// We write output in reverse, using the same storage as the sym buf.
	// Since each symbol generates <=16 bits of output data and RansEncSym
	// is larger than 16 bits, this is safe and means we use less memory.
	// It also means we don't have to check for buffer overruns in the main
	// encode loop (since the output buffer is, by definition, always
	// large enough).
	U8 out_end = (U8)r->sym_cur;
	U8 *out_cur = out_end;
	U8 out_beg = (U8)r->sym_buf;

	// actual encoding processes symbols in reverse order
	U32 xmax_base = (kRansL >> kProbBits) << 16;
	U32 xs = kRansL, xa = kRansL;
	for (RansEncSym const *symp = r->sym_cur; symp > r->sym_buf; )
	{
	// Invariant: every symbol writes <=16 bits which is less than
	// the size of a RansEncSym (so sharing space is fine)
	RansEncSym s = *--symp;

	if (s.freq < 0x8000)
	{
	// renormalize
	U32 x = ransenc_renorm(xs, xmax_base * s.freq, &out_cur);

	// encode symbol
	x = ((x / s.freq) << kProbBits) + (x % s.freq) + s.start;
	xs = xa;
	xa = x;
	}
	else if (s.freq == 0xc000) // raw 16-bit bypass
	{
	out_cur -= 2;
	write16le(out_cur, s.start & 0xffff);
	}
	else
	{
	U32 x = ransenc_write_bits(xs, s.start, s.freq & 0x7fff, &out_cur);
	xs = xa;
	xa = x;
	}
	}

	// reset symbol pointer
	r->sym_cur = r->sym_buf;

	// if we're out of space or didn't compress the data enough, use an uncompressed block.
	static UINTa const kMaxFlushSize = 5*2; // 5=max number of write16s executed
	if (out_cur - out_beg < kMaxFlushSize \|\| (out_end - out_cur) + kMaxFlushSize >= uncomp_size)
	return 0;

	// Weave alternate stream state (xs) into primary (xa) state
	// Note we're still writing backwards!
	U32 nhighest = highest_set_bit(xs);
	RR_ASSERT(nhighest >= 16 && nhighest < 32);

	// low bits of xs
	out_cur -= 2;
	write16le(out_cur, xs & 0xffff);

	// high bits of xs + bit count
	xa = ransenc_write_bits(xa, (xs >> 16) & ((1u << (nhighest - 16)) - 1), nhighest - 16, &out_cur);
	xa = ransenc_write_bits(xa, nhighest - 16, 4, &out_cur);

	// xa low state then high state. we want high state last since we use hi!=0 to signal an
	// ANS compressed block (since with our normalization conditions, hi==0 can never happen).
	out_cur -= 4;
	write16le(out_cur + 2, xa & 0xffff);
	write16le(out_cur + 0, (xa >> 16));

	*compressed_size = out_end - out_cur;
	return out_cur;
	}

	// --------------------------------------------------------------------------
	// Decoder

	struct RansDecoder
	{
	U32 x;
	U32 xalt;
	};

	static void ransdec_init_empty(RansDecoder *r)
	{
	r->x = r->xalt = kRansL;
	}

	static bool ransdec_in_final_state(RansDecoder const *r)
	{
	return r->x == kRansL && r->xalt == kRansL;
	}

	// Renormalize. Internal use only, not part of the interface!
	static RADFORCEINLINE void ransdec_renorm(RansDecoder r, U8 const *p, U32 x)
	{
	#ifndef BITKNIT_BRANCHLESS_RENORM
	if (x < kRansL)
	{
	x = (x << 16) \| read16le_unaligned(*p);
	*p += 2;
	}
	#else
	U8 const ptr = p;
	U8 const *ptr_inc = ptr + 2;
	U32 y = (x << 16) \| read16le_unaligned(ptr);
	ptr = (x < kRansL) ? ptr_inc : ptr;
	x = (x < kRansL) ? y : x;
	*p = ptr;
	#endif

	r->x = r->xalt;
	r->xalt = x;
	}

	static inline void ransdec_advance(RansDecoder r, U8 const *p, U32 start, U32 freq)
	{
	U32 x = r->x;
	U32 xs = (x & (kMaxProb - 1)) - start;
	ransdec_renorm(r, p, freq * (x >> kProbBits) + xs);
	}

	// short: nbits <= 16
	static inline U32 ransdec_getbits_short(RansDecoder r, U8 const *p, U32 nbits)
	{
	RR_ASSERT(nbits <= 16);
	U32 x = r->x;
	U32 v = x & ((1u << nbits) - 1);
	ransdec_renorm(r, p, x >> nbits);
	return v;
	}

	static inline U32 ransdec_getbits(RansDecoder r, U8 const *p, U32 nbits)
	{
	RR_ASSERT(nbits < 32);

	// grab high 0-15 bits
	U32 val = r->x;
	ransdec_renorm(r, p, val >> (nbits & 15));
	if (nbits >= 16) // grab low bits if necessary
	{
	val = (val << 16) \| read16le_unaligned(*p);
	*p += 2;
	}
	return val & ((1u << nbits) - 1);
	}

	static void ransdec_init(RansDecoder r, U8 const *p)
	{
	// Read starting state for first stream, high
	// word (LE) then low word (LE). weird mixed-endian
	// byte order but it makes sense with our tag scheme.
	// (see ransenc_attempt_flush).
	r->x = read16le_unaligned(*p + 0) << 16;
	r->x \|= read16le_unaligned(*p + 2);
	RR_ASSERT(r->x >= kRansL); // should be normalized: hi word=0 implies uncompressed!
	*p += 4;

	// number of bits for alternate stream start
	U32 na = ransdec_getbits(r, p, 4) + 16;
	r->x = r->xalt; // we're not interleaved yet!

	U32 xalt = ransdec_getbits(r, p, na) \| (1u << na);
	RR_ASSERT(xalt >= kRansL); // should be normalized: na >= 16 implies xalt >= kRansL
	r->x = r->xalt; // still not interleaved!
	r->xalt = xalt;
	}