Skip to content

Instantly share code, notes, and snippets.

@namandixit
Last active August 25, 2025 08:55
Show Gist options
  • Save namandixit/22d61e7e416f7e4637730d3e5ff2a479 to your computer and use it in GitHub Desktop.
Header Prime: The first header everyone should include in their C project (if they are smart, that is)
/*
* Creator: Naman Dixit
* Notice: © Copyright 2024 Naman Dixit
* License: BSD Zero Clause License
* SPDX: 0BSD (https://spdx.org/licenses/0BSD.html)
*/
#if !defined(STD_H_INCLUDE_GUARD)
// NOTE(review): the matching #define STD_H_INCLUDE_GUARD / closing #endif are
// not visible in this chunk -- presumably they appear later in the file; verify.
/* Compiler **************************************************************************/
// clang masquerades as MSVC (_MSC_VER) or GCC (__GNUC__) depending on its
// driver, so __clang__ is checked inside those branches to find the real frontend.
#if defined(_MSC_VER)
# if defined(__clang__)
# define ENV_COMPILER_CLANG
# define ENV_COMPILER_CLANG_WITH_MSVC
# else
# define ENV_COMPILER_MSVC
# endif
#elif defined (__GNUC__)
# if defined(__clang__)
# define ENV_COMPILER_CLANG
# define ENV_COMPILER_CLANG_WITH_GCC
# else
# define ENV_COMPILER_GCC
# endif
#elif defined(__clang__)
# define ENV_COMPILER_CLANG
#else
# error Compiler not supported
#endif
/* Operating System ******************************************************************/
#if defined(_WIN32)
# define ENV_OS_WINDOWS
#elif defined(__linux__)
# define ENV_OS_LINUX
#else
# error Operating system not supported
#endif
/* CPU Architecture ******************************************************************/
// MSVC-style frontends spell the arch macros _M_*; GCC/Clang use __i386__/__x86_64__.
#if defined(ENV_COMPILER_MSVC) || defined(ENV_COMPILER_CLANG_WITH_MSVC)
# if defined(_M_IX86)
# define ENV_ARCH_X86
# elif defined(_M_X64)
# define ENV_ARCH_X64
# endif
#elif defined(ENV_COMPILER_CLANG) || defined(ENV_COMPILER_GCC)
# if defined(__i386__)
# define ENV_ARCH_X86
# elif defined(__x86_64__)
# define ENV_ARCH_X64
# endif
#endif
// Only x64 is actually supported: 32-bit x86 is detected above but rejected
// here (the commented-out clause would re-admit it).
#if !defined(ENV_ARCH_X64) // && !defined(ENV_ARCH_X86)
# error Architecture not supported
#endif
/* Word Bit-width ********************************************************************/
// The X86 branch is currently unreachable (x86 already #error-ed out above);
// the #error inside it keeps the door shut if that check is ever relaxed.
#if defined(ENV_ARCH_X86)
# define ENV_BITWIDTH_32
# error "Bitwidth not supported"
#elif defined(ENV_ARCH_X64)
# define ENV_BITWIDTH_64
#else
# error "Bitwidth not supported"
#endif
/* Programming Language **************************************************************/
// Detects the language (C vs C++ vs Objective-C variants) and its standard
// year into ENV_LANG_C / ENV_LANG_CXX / ENV_LANG_OBJC / ENV_LANG_OBJCPP.
// Versions are matched with == so an unknown future value falls into the
// "last best supported" else branch instead of being misclassified.
#if defined(ENV_COMPILER_MSVC)
# if defined(__cplusplus)
// NOTE(review): no 201103L case here -- MSVC reports 199711L unless
// /Zc:__cplusplus is given, and then jumps straight to 201402L+; confirm.
# if __cplusplus == 199711L
# define ENV_LANG_CXX 1998
# elif __cplusplus == 201402L
# define ENV_LANG_CXX 2014
# elif __cplusplus == 201703L
# define ENV_LANG_CXX 2017
# elif __cplusplus == 202002L
# define ENV_LANG_CXX 2020
# else
# define ENV_LANG_CXX 2017 // A future C++, assume the last best supported
# endif
# elif defined(__STDC_VERSION__)
# if (__STDC_VERSION__ == 201112L) || (__STDC_VERSION__ == 201710L)
# define ENV_LANG_C 2011
# elif (__STDC_VERSION__ == 202311L) || (__STDC_VERSION__ == 202312L) // 202312L is with /std:clatest before official release of C23 support
# define ENV_LANG_C 2023
# else
# define ENV_LANG_C 2011 // Earliest C version for which MSVC supports __STDC_VERSION__
# endif
# elif defined(__STDC__) // All Microsoft extensions are off [/Za (Disable Language Extensions), similar to pedantic]
# define ENV_LANG_C 1989
# else // /Za (Disable Language Extensions) is not provided, but MS never properly supported C99.
# define ENV_LANG_C 1989
# endif
#elif defined(ENV_COMPILER_CLANG) || defined(ENV_COMPILER_GCC)
# if defined(__cplusplus)
# if defined(__OBJC__)
# define ENV_LANG_OBJCPP 1
# endif
# if __cplusplus == 199711L
# define ENV_LANG_CXX 1998
# elif __cplusplus == 201103L
# define ENV_LANG_CXX 2011
# elif __cplusplus == 201402L
# define ENV_LANG_CXX 2014
# elif __cplusplus == 201703L
# define ENV_LANG_CXX 2017
# elif __cplusplus == 202002L
# define ENV_LANG_CXX 2020
# elif __cplusplus == 202302L
# define ENV_LANG_CXX 2023
# else
# define ENV_LANG_CXX 2020 // A future C++, assume the last best supported
# endif
# elif defined(__STDC_VERSION__) // Using C Language >= 1994 (1989)
# if defined(__OBJC__)
# define ENV_LANG_OBJC 1
# endif
# if (__STDC_VERSION__ == 199409L)
# define ENV_LANG_C 1989
# elif (__STDC_VERSION__ == 199901L)
# define ENV_LANG_C 1999
# elif (__STDC_VERSION__ == 201112L) || (__STDC_VERSION__ == 201710L)
# define ENV_LANG_C 2011
# elif (__STDC_VERSION__ == 202311L)
# define ENV_LANG_C 2023
# else
# define ENV_LANG_C 1999 // At least C99 (__STDC_VERSION__ is defined, C94 is too old to fallback on)
# endif
# elif defined(__STDC__) && !defined(__STDC_VERSION__)
# if defined(__OBJC__)
# define ENV_LANG_OBJC 1
# endif
# define ENV_LANG_C 1989 // Since C89 doesn't require definition of __STDC_VERSION__
# endif
#endif
#if !defined(ENV_LANG_C) && !defined(ENV_LANG_CXX)
# error Language not supported
#endif
/* Endianness ************************************************************************/
// Windows is little-endian on all supported targets; on Linux, glibc's
// <endian.h> supplies __BYTE_ORDER at preprocessing time.
#if defined(ENV_OS_WINDOWS)
# if defined(ENV_ARCH_X86) || defined(ENV_ARCH_X64)
# define ENV_ENDIAN_LITTLE
# else
# error Could not determine endianness on Windows
# endif
#elif defined(ENV_OS_LINUX)
# include <endian.h>
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define ENV_ENDIAN_LITTLE
# elif __BYTE_ORDER == __BIG_ENDIAN
# define ENV_ENDIAN_BIG
# else
# error Could not determine endianness on Linux
# endif
#else
# error Can not determine endianness, unknown environment
#endif
/* Constants *************************************************************************/
// Byte-size helpers; the ULL suffix keeps GiB/TiB results from overflowing
// a 32-bit int.
#define KiB(x) ( (x) * 1024ULL)
#define MiB(x) (KiB(x) * 1024ULL)
#define GiB(x) (MiB(x) * 1024ULL)
#define TiB(x) (GiB(x) * 1024ULL)
#define HUNDRED 100L
#define THOUSAND 1000L
#define MILLION 1000000L
#define BILLION 1000000000L
/* Attributes ************************************************************************/
// macro_gensym_*: build pseudo-unique identifiers by pasting a suffix onto a
// prefix (macro_gensym2_ is defined further down; macros resolve at expansion
// time, so the order is fine).
//#define macro_gensym_uniq(prefix) macro_gensym2_(prefix, __COUNTER__) // Commented since I am going to use __COUNTER__ for profile events
#define macro_gensym_line(prefix) macro_gensym2_(prefix, __LINE__)
// NOTE(review): __func__ and __FILE__ are not identifier tokens (__func__ is a
// variable, __FILE__ a string literal), so pasting them with ## cannot yield a
// usable identifier -- these two look unusable as-is; confirm before using.
#define macro_gensym_func(prefix) macro_gensym2_(prefix, __func__)
#define macro_gensym_file(prefix) macro_gensym2_(prefix, __FILE__)
// Pre-C23 C (and current MSVC C) has no nullptr keyword; map it to NULL.
#if defined(ENV_LANG_C) && ((ENV_LANG_C < 2023) || (defined(ENV_COMPILER_MSVC))) // Remove when MSVC properly supports C23
# define nullptr NULL
#endif
// static_assert: C11 has _Static_assert; older C gets an emulation via a
// negative-width bitfield (GCC/Clang) or a conflicting extern array (MSVC).
#if (defined(ENV_LANG_C) && ENV_LANG_C == 2011)
# define static_assert(...) _Static_assert(__VA_ARGS__)
#elif (defined(ENV_LANG_C) && ENV_LANG_C < 2011)
# if defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define static_assert__HELPER(expr, msg) \
(!!sizeof (struct { unsigned int macro_gensym_line(STATIC_ASSERTION__): (expr) ? 1 : -1; }))
# define static_assert(expr, msg) \
extern int (*assert_function__(void)) [static_assert__HELPER(expr, msg)]
# elif defined(ENV_COMPILER_MSVC)
# define static_assert(expr, msg) \
extern char macro_gensym_line(STATIC_ASSERTION__1__)[1]; extern char macro_gensym_line(STATIC_ASSERTION__2__)[(expr)?1:2]
# endif
#endif
// Self-documenting spellings of `static` for its different roles.
#define global_variable static
#define global_immutable static const
#define persistent_variable static
#define persistent_immutable static const
#define internal_function static
#define header_function static inline
// DLL export/import decoration per compiler family.
#if defined(ENV_COMPILER_MSVC) || defined(ENV_COMPILER_CLANG_WITH_MSVC)
# if defined(BUILD_DLL)
# define exported_function __declspec(dllexport)
# else
# define exported_function __declspec(dllimport)
# endif
#elif defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define exported_function __attribute__((visibility("default")))
#endif
// NOTE(review): unlike exported_function, exported_data is unconditionally
// dllexport on MSVC (no BUILD_DLL/dllimport flip) -- confirm this asymmetry
// is intentional.
#if defined(ENV_COMPILER_MSVC) || defined(ENV_COMPILER_CLANG_WITH_MSVC)
# define exported_data __declspec(dllexport)
#elif defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define exported_data __attribute__((visibility("default")))
#endif
// packed_data(struct ...) removes padding from the wrapped declaration.
#if defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
#define packed_data(...) __VA_ARGS__ __attribute__((__packed__))
#elif defined(ENV_COMPILER_MSVC)
#define packed_data(...) __pragma( pack(push, 1) ) __VA_ARGS__ __pragma( pack(pop))
#endif
// switch_fallthrough: prefer the standard [[fallthrough]] attribute, then the
// GNU attribute, then nothing.
#if defined(__has_c_attribute)
# if __has_c_attribute(fallthrough)
# define switch_fallthrough [[fallthrough]]
# endif
#elif defined(__cplusplus) && defined(__has_cpp_attribute)
# if __has_cpp_attribute(fallthrough)
# define switch_fallthrough [[fallthrough]]
# endif
#endif
#if !defined(switch_fallthrough)
# if defined(ENV_COMPILER_GCC) && __GNUC__ >= 7
# define switch_fallthrough __attribute__ ((fallthrough))
# elif defined(ENV_COMPILER_CLANG) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9))
# define switch_fallthrough __attribute__ ((fallthrough))
# else
# define switch_fallthrough
# endif
#endif
/* Macros ****************************************************************************/
// Element count of a true array (do not pass a pointer; sizeof would be wrong).
#define elemin(array) (sizeof(array)/sizeof((array)[0]))
// Type-casts
// Four greppable cast spellings; the C++ side maps them onto named casts.
// NOTE(review): both the C cast_bit (pointer pun) and the C++ union pun below
// are formally UB for type punning; memcpy/std::bit_cast is the strictly
// conforming alternative if this ever misbehaves under optimization.
#if defined(ENV_LANG_C)
# define cast_bit(T, m) (*((T*)(&(m)))) /* Reading float as int, float has to be stored in a variable */
# define cast_ptr(T, v) ((T)(v)) /* Casting float* to int* */
# define cast_val(T, v) ((T)(v)) /* Convering float to int */
# define cast_const(T, v) ((T)(v)) /* Removing const */
#elif defined(ENV_LANG_CXX)
template<class To, class From>
To cast_bit_func(const From& src) noexcept {
static_assert(sizeof(To) == sizeof(From));
union { From f; To t; } u;
u.f = src;
return u.t; // implementation-defined/UB by strict standard, but works on many compilers
}
# define cast_bit(T, m) (cast_bit_func<T>(m)) /* Reading float as int, float has to be stored in a variable */
# define cast_ptr(T, v) (reinterpret_cast<T>(v)) /* Casting float* to int* */
# define cast_val(T, v) (static_cast<T>(v)) /* Convering float to int */
# define cast_const(T, v) (const_cast<T>(v)) /* Removing const */
#endif
// C++ spelling of GNU/C23 typeof.
#if defined(ENV_LANG_CXX)
# define typeof(...) decltype(__VA_ARGS__)
#endif
// Compound Literal
// C compound literal (T){...} vs C++ list-initialized temporary T{...}.
#if defined(ENV_LANG_C)
# define compound_init(T, ...) (T) __VA_ARGS__
#elif defined(ENV_LANG_CXX)
# define compound_init(T, ...) T __VA_ARGS__
#endif
// Recover the address of the struct that contains `member` from a member pointer.
#define containerof(ptr, type, member) (cast_val(type*, cast_val(void*, (cast_ptr(Byte*, ptr)) - offsetof(type, member))))
#define unused_variable(var) (void)var
// Likely/unlikely for better branch prediction
#if defined(ENV_COMPILER_MSVC)
# define likely(x) (x)
# define unlikely(x) (x)
#elif defined(ENV_COMPILER_CLANG) || defined(ENV_COMPILER_GCC)
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// Defer-like macros
#define macro_gensym2_(prefix, suffix) macro_gensym_cat_(prefix, suffix)
#define macro_gensym_cat_(prefix, suffix) prefix ## suffix
// macro_entail(ACTIONS) { BODY; }
// Runs BODY first, then ACTIONS: the goto jumps over ACTIONS into the else
// label that prefixes BODY; after BODY falls out of the if/else, the while
// loops back into the if(true) arm, runs ACTIONS, and breaks.
#define macro_entail(...) \
goto macro_gensym_line(jump_to_else); \
\
while (true) \
if (true) { \
__VA_ARGS__; \
break; \
} else macro_gensym_line(jump_to_else):
// macro_envelop(PRE, POST) { BODY; } -- PRE before BODY, POST after it,
// using the same goto/else trick as macro_entail.
#define macro_envelop(cloak_arg_pre_action, cloak_arg_post_action) \
cloak_arg_pre_action; \
goto macro_gensym_line(jump_to_else); \
\
while (true) \
if (true) { \
cloak_arg_post_action; \
break; \
} else macro_gensym_line(jump_to_else):
/* Error handling macros
*
* Sample Code:
*
* T *ta = nullptr, *tb = nullptr;
* err_try(doing_stuff) {
* ta = tCreate();
* if (!ta) err_throw(doing_stuff);
* tb = tCreate();
* if (!tb) err_throw(doing_stuff);
* stuffDo(ta, tb);
* } err_catch(doing_stuff) {
* if (ta) tDelete(ta);
* if (tb) tDelete(tb);
* }
*/
// err_try declares the flag and opens a do/while(0); err_throw sets it and
// breaks out; err_catch closes the loop and runs only when the flag is set.
#define err_try(fail) Bool fail = false; do
#define err_throw(fail) {fail=true;break;}(void)0
#define err_catch(fail) while (0); if (fail)
// Expands to a harmless declaration so macros can demand a trailing semicolon.
#define eat_semicolon() static_assert(1, "")
/* Less noisy pragmas */
// pragma_<compiler>(P) emits _Pragma(P) only under that compiler and a
// harmless static_assert otherwise, so every call site can end in `;`.
#if defined(ENV_COMPILER_CLANG)
# define pragma_msvc(P) eat_semicolon()
# define pragma_clang(P) _Pragma(P) eat_semicolon()
# define pragma_gcc(P) eat_semicolon()
#elif defined(ENV_COMPILER_GCC)
# define pragma_msvc(P) eat_semicolon()
# define pragma_clang(P) eat_semicolon()
# define pragma_gcc(P) _Pragma(P) eat_semicolon()
#elif defined(ENV_COMPILER_MSVC)
# define pragma_msvc(P) _Pragma(P) eat_semicolon()
# define pragma_clang(P) eat_semicolon()
# define pragma_gcc(P) eat_semicolon()
#endif
/* Compiler-specific tokens */
// with_<compiler>(P) keeps the token stream P only under that compiler.
#if defined(ENV_COMPILER_CLANG)
# define with_msvc(P)
# define with_clang(P) P
# define with_gcc(P)
#elif defined(ENV_COMPILER_GCC)
# define with_msvc(P)
# define with_clang(P)
# define with_gcc(P) P
#elif defined(ENV_COMPILER_MSVC)
# define with_msvc(P) P
# define with_clang(P)
# define with_gcc(P)
#endif
#define with_clang_gcc(P) with_clang(P) with_gcc(P)
#define with_clang_msvc(P) with_clang(P) with_msvc(P)
// C99 `static N` inside an array parameter promises a minimum length to the
// optimizer; elsewhere it degrades to a plain size expression.
#if defined(ENV_LANG_C) && ENV_LANG_C >= 1999
# define static_array_size(size) with_clang_gcc(static size)
#else
# define static_array_size(size) (size)
#endif
/* Disable all warnings in headers */
// NOTE(review): only clang and MSVC pragmas are emitted here, so GCC warnings
// are NOT silenced by disable_warnings() -- confirm that is intentional.
#define disable_warnings() \
pragma_clang("clang diagnostic push"); \
pragma_clang("clang diagnostic ignored \"-Weverything\""); \
pragma_msvc("warning ( push , 0 )")
#define enable_warnings() \
pragma_clang("clang diagnostic pop"); \
pragma_msvc("warning ( pop )")
// Wrap a type declaration to suppress MSVC's C4820 padding warning.
#define allow_padding_in_type(...) \
pragma_msvc("warning ( push )"); \
pragma_msvc("warning ( disable: 4820 )"); \
__VA_ARGS__ \
pragma_msvc("warning ( pop )");
/* Stringify contents of a macro */
#define stringify(a) stringify_helper_(a)
#define stringify_helper_(a) #a
/* Header Files **********************************************************************/
// Function-less C89 headers
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <stddef.h>
// Function-less C99 headers
#if (defined(ENV_LANG_C) && (ENV_LANG_C >= 1999)) || (defined(ENV_LANG_CXX) && (ENV_LANG_CXX >= 2011))
# include <inttypes.h>
# include <stdbool.h>
# include <stdint.h>
#endif
// Function-less C11 headers
// stdalign
#if (defined(ENV_LANG_C) && (ENV_LANG_C >= 2011)) || (defined(ENV_LANG_CXX) && (ENV_LANG_CXX >= 2011))
# include <stdalign.h>
#else
// Pre-C11 fallback: GNU keyword stands in for alignof, and a maximally
// aligned struct stands in for max_align_t.
// NOTE(review): __alignof__/__BIGGEST_ALIGNMENT__ are GCC/Clang extensions;
// this branch would not compile under pre-C11 MSVC -- confirm that path
// cannot be reached.
# define alignof(...) __alignof__(__VA_ARGS__)
typedef struct max_align_t {
char align_buf[__BIGGEST_ALIGNMENT__] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
} max_align_t;
#endif
#if defined(ENV_ARCH_X64) || defined(ENV_ARCH_X86)
# include <pmmintrin.h> // SSE3, 100% support as per Steam Hardware Survey
#endif
#if defined(ENV_COMPILER_MSVC)
// WARN(naman): intrin0.inl.h is an internal header. This might break.
// NOTE(naman): We include intrin0.inl.h instead of the documented intrin.h because intrin.h includes
// EVERY possible intrinsic, and we don't want that (the whole point of only include headers
// SSE3 above).
# include <intrin0.inl.h>
#else
# if defined(ENV_ARCH_X64) || defined(ENV_ARCH_X86)
# include <x86intrin.h> // Add with carry
# endif
#endif
// FIXME(naman): MSVC doesn't seem to define max_align_t for C11 code, so this will have to suffice for now.
// For allocation alignments, see docs.microsoft.com/en-us/cpp/c-runtime-library/reference/malloc?view=msvc-160
#if defined(ENV_COMPILER_MSVC)
# if defined(ENV_LANG_C)
// BUG FIX: this previously tested ENV_ARCH_x86 (lower-case x), a macro that is
// never defined (arch detection above defines ENV_ARCH_X86), so the x86 branch
// could never be selected and 32-bit MSVC C builds got no max_align_t at all.
# if defined(ENV_ARCH_X86)
typedef struct { double a; } max_align_t; // 8-bytes aligned (docs.microsoft.com/en-us/cpp/c-runtime-library/reference/malloc?view=msvc-160)
static_assert(alignof(max_align_t) == 8, "Alignment of max_align_t is not 8");
# elif defined(ENV_ARCH_X64)
typedef struct { __m128 sse; } max_align_t; // 16-byte aligned (docs.microsoft.com/en-us/cpp/cpp/m128?view=msvc-160)
static_assert(alignof(max_align_t) == 16, "Alignment of max_align_t is not 16");
# endif
# elif defined(ENV_LANG_CXX)
// C++ mode: lift the standard type out of namespace std.
# include <cstddef>
typedef std::max_align_t max_align_t;
# endif
#endif
#if defined(ENV_COMPILER_CLANG) || defined(ENV_COMPILER_GCC)
// Pull in the ASan interface only when sanitizers are both requested
// (BUILD_SANITIZERS) and actually enabled in this compile.
// NOTE(review): GCC versions without __has_feature may reject the unguarded
// `__has_feature(...)` inside #if; this is usually wrapped in
// `#if defined(__has_feature)` -- verify on the oldest supported GCC.
# if defined(BUILD_SANITIZERS) && (__has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__))
disable_warnings();
# define _DISABLE_STL_ANNOTATION
# include <sanitizer/asan_interface.h>
# undef _DISABLE_STL_ANNOTATION
enable_warnings();
# define ENV_SANITIZER_ADDRESS
# endif
#endif
/* Primitive Types *******************************************************************/
// Short names for fixed-width and word-sized types used throughout the header.
typedef int Sint;
typedef unsigned int Uint;
#include <stdint.h> // unconditional here (also conditionally included above)
typedef int8_t Sint8;
typedef int16_t Sint16;
typedef int32_t Sint32;
typedef int64_t Sint64;
typedef uint8_t Uint8;
typedef uint16_t Uint16;
typedef uint32_t Uint32;
typedef uint64_t Uint64;
typedef float Float32;
typedef double Float64;
typedef size_t Size;
// Signed size: POSIX ssize_t on Linux, SSIZE_T on Windows.
// NOTE(review): ssize_t lives in <sys/types.h>, which is not included here --
// presumably it arrives transitively via another header; verify.
#if defined(ENV_OS_LINUX)
typedef ssize_t SSize;
#elif defined(ENV_OS_WINDOWS)
# include <basetsd.h>
typedef SSIZE_T SSize;
#endif
typedef uintptr_t Uptr;
typedef intptr_t Sptr;
typedef ptrdiff_t Dptr;
typedef unsigned char Byte;
typedef char Char;
// Boolean type: _Bool for C99 and later, `bool` for C++ (and pre-C99 C).
// BUG FIX: the original had a second `ENV_LANG_C == 2011` branch that was
// unreachable (2011 >= 1999 already matches the first branch) -- collapsed.
#if defined(ENV_LANG_C) && (ENV_LANG_C >= 1999)
typedef _Bool Bool;
#else
typedef bool Bool;
#endif
// NOTE(naman): We define our own true & false because by default, they are defined as 1 & 0 and that doesn't play well with _Generic.
// The ((Bool)+N) form casts the literal to Bool so _Generic selects the Bool
// association instead of int.
pragma_clang("clang diagnostic push");
pragma_clang("clang diagnostic ignored \"-Wkeyword-macro\"");
#if defined(ENV_LANG_C)
# undef true
# define true ((Bool)+1)
# undef false
# define false ((Bool)+0)
#endif
pragma_clang("clang diagnostic pop");
/* Helper Stuff **********************************************************************/
// maximum/minimum: C macros evaluate their arguments twice (avoid side
// effects); the C++ templates evaluate once.
// NOTE(review): the C++ versions return const T& -- binding the result to a
// reference when called with temporaries dangles; verify call sites take the
// result by value.
#if defined(ENV_LANG_C)
# define maximum(x, y) ((x) > (y) ? (x) : (y))
# define minimum(x, y) ((x) < (y) ? (x) : (y))
#elif defined(ENV_LANG_CXX)
template<typename T>
const T& maximum (const T& a, const T& b)
{
return (a < b) ? b : a;
}
template<typename T>
const T& minimum (const T& a, const T& b)
{
return (a > b) ? b : a;
}
#endif
// Zero the object a pointer points to (pass a pointer, not an array).
#define memzero(_x) memset((_x), 0, sizeof(*(_x)))
// Zero-initialized value of type _t (compound literal in C, T{} in C++).
#if defined(ENV_LANG_C)
# define valzero(_t) (_t){}
#elif defined(ENV_LANG_CXX)
# define valzero(_t) _t{}
#endif
#if defined(BUILD_INTERNAL)
// Internal builds trap into the debugger; release builds make breakpoint() a no-op.
# if defined(ENV_OS_WINDOWS)
# define breakpoint() __debugbreak()
# elif defined(ENV_OS_LINUX)
# if defined(ENV_ARCH_X86) || defined(ENV_ARCH_X64)
# if defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define breakpoint() __asm__ volatile("int $0x03")
# endif // !GCC && !Clang
# endif // !x86 && !x64
# endif // !window && !linux
#else
# define breakpoint() do {} while (0)
#endif
#define debugBreak() breakpoint()
// debugAssert always evaluates its expression (needed when the expression has
// side effects, e.g. the _BitScan* calls below); only the trap is debug-only.
#define debugAssert(...) do { if (!(__VA_ARGS__)) { debugBreak(); } } while (0)
#if defined(BUILD_INTERNAL)
// In internal builds, reaching this asserts against an always-false constant,
// which trips the breakpoint.
global_immutable Bool debug_unreachable_code_executed = false;
# define debugUnreachable() debugAssert(debug_unreachable_code_executed == true)
#else
// In release builds, tell the optimizer the path is impossible.
# if defined(ENV_COMPILER_MSVC)
# define debugUnreachable() __assume(false)
# elif defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define debugUnreachable() __builtin_unreachable()
# endif
#endif
// Signature helper for qsort/bsearch-style comparison callbacks.
#define COMPARE_FUNC(name) Sint name (const void *a, const void *b)
#if defined(ENV_OS_WINDOWS)
// On Windows, declare just the four mem/str intrinsics instead of pulling in
// all of <string.h>, and force their intrinsic (inlined) forms.
# if defined(ENV_LANG_CXX)
extern "C" {
# endif
void * __cdecl memset(void *, Sint, Size); _Pragma("intrinsic(memset)")
void * __cdecl memcpy(void *, const void *, Size); _Pragma("intrinsic(memcpy)")
int __cdecl memcmp(const void*, const void*, Size); _Pragma("intrinsic(memcmp)")
Size __cdecl strlen(const char*); _Pragma("intrinsic(strlen)")
// Compare a length-al string `a` against NUL-terminated `b`: when lengths
// differ, the sign of the unmatched character decides; otherwise memcmp.
header_function
Sint strlcmp (const Char *a, Size al, const Char *b)
{
Size bl = strlen(b);
/**/ if (al > bl) return a[bl];
else if (al < bl) return -b[al];
else {
Sint result = memcmp(a, b, minimum(al, bl));
return result;
}
}
// NOTE(review): this shadows the CRT's strcmp on Windows; its return values
// come from strlcmp above and are not guaranteed to match the CRT's ordering
// contract exactly -- confirm callers only test ==/!= 0 or accept this order.
header_function
Sint strcmp (const Char *a, const Char *b)
{
Size al = strlen(a);
Sint result = strlcmp(a, al, b);
return result;
}
header_function
Bool strleq (const Char *a, Size al, const Char *b)
{
return strlcmp(a, al, b) == 0;
}
header_function
Bool streq (const Char *a, const Char *b)
{
return strcmp(a, b) == 0;
}
# if defined(ENV_LANG_CXX)
}
# endif
#else
# include <string.h>
#endif
/* *********************************************************************************
* Base 32 Encoding
* ****************
* Inspired by Crockford's Base 32 encoding (https://www.crockford.com/base32.html)
*
* Changes:
*
* 1. U/u is decoded to the same integer as V/v.
* When encoding, v is the right value to encode to.
*
* 2. For the extra check symbols, use - . _ ~ u, since
* these are the symbols considered unreserved in URLs
* See section 2.3 of https://www.ietf.org/rfc/rfc3986.txt
*
* (Explanation: If checksum is required, the integer
* value of the string is divided by 37 (the next prime
* after 32) and the encoded remainder is appended after
* the string. Since the value will be <= 37, five extra
* symbols are required).
*
* 3. The encoded characters are lower-case, not upper case
* since upper case looks like screaming. Also, this way,
* alphabets and numbers are a bit more distinguishable.
* And it reads like r0x0ring h4x0r 13375p34k, n00bs4uc3!
* (Obligatory: https://megatokyo.com/strip/9)
*/
header_function
Uint8 b32DecodeChar (Char code) // Character to integer
{
    // Decoding is case-insensitive: fold upper-case letters down to
    // lower-case first, then map through a single lower-case table.
    Char c = code;
    if ((c >= 'A') && (c <= 'Z')) {
        c = (Char)(c + ('a' - 'A'));
    }

    switch (c) {
        case 'o': case '0': return 0;              // o looks like 0
        case 'i': case 'l': case '1': return 1;    // i and l look like 1
        case '2': return 2;
        case '3': return 3;
        case '4': return 4;
        case '5': return 5;
        case '6': return 6;
        case '7': return 7;
        case '8': return 8;
        case '9': return 9;
        case 'a': return 10;
        case 'b': return 11;
        case 'c': return 12;
        case 'd': return 13;
        case 'e': return 14;
        case 'f': return 15;
        case 'g': return 16;
        case 'h': return 17;
        case 'j': return 18;
        case 'k': return 19;
        case 'm': return 20;
        case 'n': return 21;
        case 'p': return 22;
        case 'q': return 23;
        case 'r': return 24;
        case 's': return 25;
        case 't': return 26;
        case 'u': case 'v': return 27;             // u decodes like v (header note)
        case 'w': return 28;
        case 'x': return 29;
        case 'y': return 30;
        case 'z': return 31;
        default: debugUnreachable(); return 0;     // caller must pre-validate
    }
}
header_function
Uint8 b32DecodeChecksumChar (Char check) // Character to integer
{
    // The five extra checksum symbols extend the base-32 alphabet to 37
    // values; note 'u' means 36 here (it is only a v-alias in normal digits).
    if (check == '-') return 32;
    if (check == '.') return 33;
    if (check == '_') return 34;
    if (check == '~') return 35;
    if (check == 'u') return 36;
    return b32DecodeChar(check); // ordinary digit
}
header_function
Char b32EncodeChar (Uint8 val) // Integer to character
{
    // Lower-case Crockford-style alphabet; i, l, o and u are omitted
    // (they exist only as decode-time aliases).
    static const Char b32_alphabet[] = "0123456789abcdefghjkmnpqrstvwxyz";
    if (val < 32) {
        return b32_alphabet[val];
    }
    debugUnreachable(); // caller must pass a value in [0, 32)
    return 0;
}
header_function
Char b32EncodeChecksumChar (Uint8 val) // Integer to character
{
    // Values 32-36 use the URL-safe checksum symbols (see header comment);
    // anything smaller encodes as a normal base-32 digit.
    static const Char b32_check_symbols[] = "-._~u";
    if ((val >= 32) && (val <= 36)) {
        return b32_check_symbols[val - 32];
    }
    return b32EncodeChar(val);
}
// Validate an encoded string (and optionally its checksum character).
// Returns true when every character belongs to the canonical alphabet and,
// if `checksum` is non-zero, when the mod-37 checksum matches.
// NOTE(review): the decode-time aliases i/l/o/u (and upper-case forms) are
// accepted by b32DecodeChar but REJECTED here -- confirm whether verification
// is meant to be stricter than decoding.
header_function
Bool b32VerifyInputString (Char *str, Size len, Char checksum)
{
Uint8 mod = 0;
for (Size i = 0; i < len; i++) {
Bool valid = false;
valid |= (str[i] >= 0x30) && (str[i] <= 0x39); // 0-9
valid |= (str[i] >= 0x61) && (str[i] <= 0x68); // a-h
valid |= (str[i] == 0x6a); // j
valid |= (str[i] == 0x6b); // k
valid |= (str[i] == 0x6d); // m
valid |= (str[i] == 0x6e); // n
valid |= (str[i] >= 0x70) && (str[i] <= 0x74); // p-t
valid |= (str[i] >= 0x76) && (str[i] <= 0x7a); // v-z
valid |= (str[i] >= 0x41) && (str[i] <= 0x48); // A-H
valid |= (str[i] == 0x4a); // J
valid |= (str[i] == 0x4b); // K
valid |= (str[i] == 0x4d); // M
valid |= (str[i] == 0x4e); // N
valid |= (str[i] >= 0x50) && (str[i] <= 0x54); // P-T
valid |= (str[i] >= 0x56) && (str[i] <= 0x5a); // V-Z
if (!valid) return false;
mod = (mod + b32DecodeChar(str[i])) % 37u; // running mod-37 checksum
}
if (checksum) {
Uint8 check_car = b32DecodeChecksumChar(checksum);
Uint8 check_val = mod;
if (check_val != check_car) return false;
}
return true;
}
header_function
Char b32GetChecksumChar (Char *str, Size len)
{
    // Checksum = (sum of digit values) mod 37, rendered as a character
    // (values 32-36 use the extended checksum symbols).
    Uint8 sum_mod_37 = 0;
    for (Size idx = 0; idx < len; idx++) {
        sum_mod_37 = (sum_mod_37 + b32DecodeChar(str[idx])) % 37u;
    }
    return b32EncodeChecksumChar(sum_mod_37);
}
/************************************************************************************
* Unicode Processing
*/
// TODO(naman): Go through these security guidelines for safe UTF-8 processing:
// https://security.stackexchange.com/questions/257017/what-are-best-practices-for-handling-user-unicode-in-a-web-application
/*
* First codepoint Last codepoint Byte 1 Byte 2 Byte 3 Byte 4
* U+0000 U+007F 0yyyzzzz
* U+0080 U+07FF 110xxxyy 10yyzzzz
* U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
* U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
*/
// UTF32 to UTF8
// Encode one codepoint into `buffer`; returns the number of bytes written
// (1-4), or 0 when buffer is null or code exceeds U+10FFFF.
// NOTE(review): surrogate codepoints U+D800-U+DFFF are NOT rejected even
// though they are ill-formed in UTF-8 (see the security TODO above) --
// confirm callers pass only Unicode scalar values.
header_function
Size unicodeEncode (Uint32 code, Byte buffer[static_array_size(4)])
{
if (buffer == nullptr) return 0;
if (code <= 0x7F) {
buffer[0] = cast_val(Byte, code); // 0yyyzzzz
return 1;
} else if (code <= 0x7FF) {
buffer[0] = cast_val(Byte, cast_val(Byte, 0xC0) | cast_val(Byte, (code >> 6))); // 110xxxyy
buffer[1] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, (code & 0x3F))); // 10yyzzzz
return 2;
} else if (code <= 0xFFFF) {
buffer[0] = cast_val(Byte, cast_val(Byte, 0xE0) | cast_val(Byte, (code >> 12))); // 1110wwww
buffer[1] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, ((code >> 6) & 0x3F))); // 10xxxxyy
buffer[2] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, (code & 0x3F))); // 10yyzzzz
return 3;
} else if (code <= 0x10FFFF) {
buffer[0] = cast_val(Byte, cast_val(Byte, 0xF0) | cast_val(Byte, (code >> 18))); // 11110uvv
buffer[1] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, ((code >> 12) & 0x3F))); // 10vvwwww
buffer[2] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, ((code >> 6) & 0x3F))); // 10xxxxyy
buffer[3] = cast_val(Byte, cast_val(Byte, 0x80) | cast_val(Byte, (code & 0x3F))); // 10yyzzzz
return 4;
} else {
return 0; // Invalid codepoint
}
}
// UTF8 to UTF32
// Decode one codepoint starting at buffer[*offset] (`len` is the total buffer
// length). Advances *offset past each byte as it is examined and stores the
// result in *codepoint (0 for a NUL byte). Returns true on success; offset
// and codepoint may be null (scratch locals are substituted).
// NOTE(review): on a malformed/truncated sequence *offset has already been
// advanced past the bytes examined; callers resynchronise from there.
// NOTE(review): overlong encodings and surrogates are not rejected -- see
// the security TODO above.
header_function
Bool unicodeDecode (const Byte *buffer, Size len, Size *offset, Uint32 *codepoint)
{
if (buffer == nullptr) return 0;
Size temp1 = 0; if (offset == nullptr) offset = &temp1;
Uint32 temp2 = 0; if (codepoint == nullptr) codepoint = &temp2;
Size o = *offset;
len -= o; // remaining bytes from the current position
*codepoint = 0;
if (len == 0) return false;
Uint32 b0 = buffer[o+0];
*offset += 1;
if (b0 == 0) return true; // nullptr-byte
if ((b0 & 0x80) == 0) { // 0x80 = 10000000 -- ASCII fast path
*codepoint = b0;
return true;
}
if (len == 1) return false; // lead byte demands a continuation we don't have
Uint32 b1 = buffer[o+1];
*offset += 1;
if (b1 == 0) return false;
Uint32 BA1 = b1 & 0xC0; // 0xC0 = 11000000 -- continuation marker bits
Uint32 BB1 = b1 & 0x3F; // 0x3F = 00111111 -- continuation payload bits
if ((b0 & 0xE0) == 0xC0) { // 0xE0 = 11100000 , 0xC0 = 11000000 -- 2-byte form
if (BA1 != 0x80) return false; // 0x80 = 10000000
Uint32 B = b0 & 0x1F; // 0x1F = 00011111
*codepoint = (B << 6) | BB1;
return true;
}
if (len == 2) return false;
Uint32 b2 = buffer[o+2];
*offset += 1;
if (b2 == 0) return false;
Uint32 BA2 = b2 & 0xC0; // 0xC0 = 11000000
Uint32 BB2 = b2 & 0x3F; // 0x3F = 00111111
if ((b0 & 0xF0) == 0xE0) { // 0xF0 = 11110000 , 0xE0 = 11100000 -- 3-byte form
if (BA1 != 0x80) return false; // 0x80 = 10000000
if (BA2 != 0x80) return false; // 0x80 = 10000000
Uint32 B = b0 & 0x0F; // 0x0F = 00001111
*codepoint = (B << 12) | (BB1 << 6) | BB2;
return true;
}
if (len == 3) return false;
Uint32 b3 = buffer[o+3];
*offset += 1;
if (b3 == 0) return false;
Uint32 BA3 = b3 & 0xC0; // 0xC0 = 11000000
Uint32 BB3 = b3 & 0x3F; // 0x3F = 00111111
if ((b0 & 0xF8) == 0xF0) { // 0xF8 = 11111000 , 0xF0 = 11110000 -- 4-byte form
if (BA1 != 0x80) return false; // 0x80 = 10000000
if (BA2 != 0x80) return false; // 0x80 = 10000000
if (BA3 != 0x80) return false; // 0x80 = 10000000
Uint32 B = b0 & 0x07; // 0x07 = 00000111
Uint32 result = (B << 18) | (BB1 << 12) | (BB2 << 6) | BB3;
if (result > 0x10FFFF) return false; // beyond the Unicode range
*codepoint = result;
return true;
}
return false; // invalid lead byte (0xF8..0xFF or stray continuation)
}
/**********************************************************************
* Bitwise Delight
*/
// NOTE(naman): Bit vectors are supposed to be zero-indexed.
// NOTE(naman): Base type of bit vectors shouldn't have size > 8 bytes (to prevent shift overflow).
// Bytes needed to hold b bits (rounded up).
#define bitToBytes(b) (((b)+(CHAR_BIT-1))/(CHAR_BIT))
// Which element of the backing array holds bit `index`, and which bit of
// that element it is (bit vectors are zero-indexed; see NOTE above).
#define bit_ValInBuf(array, index) ((index)/(CHAR_BIT * sizeof(*(array))))
#define bit_BitInVal(array, index) ((index)%(CHAR_BIT * sizeof(*(array))))
#define bitSet(array, index) \
((array)[bit_ValInBuf(array, index)] |= (1LLU << bit_BitInVal(array, index)))
#define bitReset(array, index) \
((array)[bit_ValInBuf(array, index)] &= ~(1LLU << bit_BitInVal(array, index)))
// BUG FIX: bitToggle previously XOR-ed with ~(1 << bit), which flipped every
// bit in the element EXCEPT the addressed one; toggling XORs the mask itself.
#define bitToggle(array, index) \
((array)[bit_ValInBuf(array, index)] ^= (1LLU << bit_BitInVal(array, index)))
#define bitTest(array, index) \
((array)[bit_ValInBuf(array, index)] & (1LLU << bit_BitInVal(array, index)))
// Zero-based position of the most significant set bit of x.
// Precondition: x != 0 (asserted; the intrinsic/builtin is undefined for 0).
header_function
Uint8 bitMSBU32 (Uint32 x)
{
debugAssert(x);
Uint8 result;
#if defined(ENV_COMPILER_MSVC)
// _BitScanReverse: Search MSB->LSB and load the position of first bit in location (zero-indexed)
// The call sits inside debugAssert, which always evaluates its expression,
// so the scan still runs in release builds.
unsigned long msb_location = 0;
debugAssert(_BitScanReverse(&msb_location, x));
result = cast_val(Uint8, msb_location);
#else
// __builtin_clz: Returns the number of leading 0-bits starting from MSB
Sint maybe_set_bits = 32 - __builtin_clz(x);
Sint msb_location = maybe_set_bits - 1; // Zero indexing
result = cast_val(Uint8, msb_location);
#endif
return result;
}
// 64-bit variant of bitMSBU32: zero-based MSB position; x must be non-zero.
header_function
Uint8 bitMSBU64 (Uint64 x)
{
debugAssert(x);
Uint8 result;
#if defined(ENV_COMPILER_MSVC)
// _BitScanReverse64: Search MSB->LSB and load the position of first bit in location (zero-indexed)
// debugAssert always evaluates its expression, so the scan runs in release too.
unsigned long msb_location = 0;
debugAssert(_BitScanReverse64(&msb_location, x));
result = cast_val(Uint8, msb_location);
#else
// __builtin_clzll: Returns the number of leading 0-bits starting from MSB
Sint maybe_set_bits = 64 - __builtin_clzll(x);
Sint msb_location = maybe_set_bits - 1; // Zero indexing
result = cast_val(Uint8, msb_location);
#endif
return result;
}
// Zero-based position of the least significant set bit of x; x must be non-zero.
header_function
Uint8 bitLSBU32 (Uint32 x)
{
debugAssert(x);
Uint8 result;
#if defined(ENV_COMPILER_MSVC)
// _BitScanForward: Search LSB->MSB and load the position of first bit in location (zero-indexed)
// debugAssert always evaluates its expression, so the scan runs in release too.
unsigned long lsb_location = 0;
debugAssert(_BitScanForward(&lsb_location, x));
result = cast_val(Uint8, lsb_location);
#else
// __builtin_ctz: Returns the number of trailing 0-bits starting from LSB
Sint lsb_location = __builtin_ctz(x);
result = cast_val(Uint8, lsb_location);
#endif
return result;
}
// 64-bit variant of bitLSBU32: zero-based LSB position; x must be non-zero.
header_function
Uint8 bitLSBU64 (Uint64 x)
{
debugAssert(x);
Uint8 result;
#if defined(ENV_COMPILER_MSVC)
// _BitScanForward: Search LSB->MSB and load the position of first bit in location (zero-indexed)
// debugAssert always evaluates its expression, so the scan runs in release too.
unsigned long lsb_location = 0;
debugAssert(_BitScanForward64(&lsb_location, x));
result = cast_val(Uint8, lsb_location);
#else
// __builtin_ctzll: Returns the number of trailing 0-bits starting from LSB
Sint lsb_location = __builtin_ctzll(x);
result = cast_val(Uint8, lsb_location);
#endif
return result;
}
// Width-generic wrappers: C11 dispatches on the operand type with _Generic,
// C++ uses ordinary overloading.
#if defined(ENV_LANG_C) && ENV_LANG_C >= 2011
# define bitMSB(_x) _Generic((_x), Uint32: bitMSBU32, Uint64: bitMSBU64)(_x)
# define bitLSB(_x) _Generic((_x), Uint32: bitLSBU32, Uint64: bitLSBU64)(_x)
#elif defined(ENV_LANG_CXX)
header_function Uint8 bitMSB (Uint32 x) { return bitMSBU32(x); }
header_function Uint8 bitMSB (Uint64 x) { return bitMSBU64(x); }
header_function Uint8 bitLSB (Uint32 x) { return bitLSBU32(x); }
header_function Uint8 bitLSB (Uint64 x) { return bitLSBU64(x); }
#endif // !LANG_C && !LANG_CPP
header_function
Uint64 bitPow2U32 (Uint32 x)
{
    // 2^x widened to 64 bits; x must be < 64 or the shift is undefined.
    return 1ull << x;
}
header_function
Uint64 bitPow2U64 (Uint64 x)
{
    // 2^x; x must be < 64 or the shift is undefined.
    return 1ull << x;
}
header_function
Uint8 bitLog2U32 (Uint32 x)
{
    // floor(log2(x)) == zero-based position of the most significant set bit.
    // x == 0 trips the debug assert; release builds get 0 back thanks to the
    // explicit guard (bitMSBU32 would be undefined for 0).
    debugAssert(x);
    if (x == 0) {
        return 0;
    }
    return bitMSBU32(x);
}
header_function
Uint8 bitLog2U64 (Uint64 x)
{
    // floor(log2(x)) for 64-bit operands; 0 maps to 0 in release builds
    // (and trips the assert in internal builds).
    debugAssert(x);
    if (x == 0) {
        return 0;
    }
    return bitMSBU64(x);
}
header_function
Bool bitIsPowerOf2U32 (Uint32 x)
{
    // A power of two has exactly one set bit, so clearing the lowest set bit
    // (x & (x - 1)) leaves zero. NOTE: x == 0 also yields true here; zero is
    // only rejected by the debug assert.
    debugAssert(x);
    Uint32 lowest_bit_cleared = x & (x - 1u);
    return lowest_bit_cleared == 0;
}
header_function
Bool bitIsPowerOf2U64 (Uint64 x)
{
    // Single-set-bit test via x & (x - 1); x == 0 also reports true
    // (guarded only by the debug assert).
    debugAssert(x);
    Uint64 lowest_bit_cleared = x & (x - 1u);
    return lowest_bit_cleared == 0;
}
header_function
Uint64 bitPrevPowerOf2U32 (Uint32 x)
{
    // Largest power of two strictly less than x:
    //   x a power of two     -> x/2            (1 << (log2(x) - 1))
    //   x not a power of two -> 1 << floor(log2(x))
    // BUG FIX: the old code returned x itself (not a power of two at all)
    // for non-power inputs, and shifted by (0 - 1u) -- undefined -- for
    // x == 1. Precondition: x > 1 (no power of two precedes 1).
    debugAssert(x > 1);
    if (x <= 1) return 1; // degenerate input: clamp instead of invoking UB
    Uint64 y;
    if (bitIsPowerOf2U32(x)) {
        y = 1ull << (bitLog2U32(x) - 1u);
    } else {
        y = 1ull << bitLog2U32(x);
    }
    return y;
}
header_function
Uint64 bitPrevPowerOf2U64 (Uint64 x)
{
    // Largest power of two strictly less than x:
    //   x a power of two     -> x/2            (1 << (log2(x) - 1))
    //   x not a power of two -> 1 << floor(log2(x))
    // BUG FIX: the old code returned x itself (not a power of two at all)
    // for non-power inputs, and shifted by (0 - 1) for x == 1.
    // Precondition: x > 1 (no power of two precedes 1).
    debugAssert(x > 1);
    if (x <= 1) return 1; // degenerate input: clamp instead of invoking UB
    Uint64 y;
    if (bitIsPowerOf2U64(x)) {
        y = 1ull << (bitLog2U64(x) - 1ull);
    } else {
        y = 1ull << bitLog2U64(x);
    }
    return y;
}
header_function
Uint64 bitNextPowerOf2U32 (Uint32 x)
{
    // Smallest power of two strictly greater than x, which for every x > 0
    // is 1 << (floor(log2(x)) + 1).
    // BUG FIX: the old code returned x itself (not a power of two) for
    // non-power inputs, and computed 1u << 32 -- undefined -- for x >= 2^31;
    // widening the literal to 1ull keeps the shift defined for all inputs.
    debugAssert(x);
    Uint64 y = 1ull << (bitLog2U32(x) + 1u);
    return y;
}
header_function
Uint64 bitNextPowerOf2U64 (Uint64 x)
{
    // Smallest power of two strictly greater than x: 1 << (floor(log2(x)) + 1).
    // BUG FIX: the old code returned x itself (not a power of two) for
    // non-power inputs.
    // Precondition: x < 2^63 -- above that the result does not fit in 64 bits
    // (and the shift by 64 would be undefined).
    debugAssert(x);
    Uint64 y = 1ull << (bitLog2U64(x) + 1ull);
    return y;
}
// Number of leading zeros (on the most significant end)
header_function
Uint8 bitMSZerosU32 (Uint32 x)
{
    // Leading-zero count for a non-zero 32-bit value: bits above the MSB.
    debugAssert(x);
    Uint8 highest_set_bit = bitMSBU32(x);
    Uint8 clz = cast_val(Uint8, 31u - highest_set_bit);
    return clz;
}
header_function
Uint8 bitMSZerosU64 (Uint64 x)
{
    // Leading-zero count for a non-zero 64-bit value: bits above the MSB.
    debugAssert(x);
    Uint8 highest_set_bit = bitMSBU64(x);
    Uint8 clz = cast_val(Uint8, 63u - highest_set_bit);
    return clz;
}
// square root functions from Mark Dickinson
// https://stackoverflow.com/a/70550680/12862673
header_function
Uint16 bitSqrtU32 (Uint32 x)
{
    // Integer square root: floor(sqrt(x)), computed by normalising x so its
    // MSB sits near the top, refining an estimate with division steps, then
    // shifting the result back down by half the normalisation amount.
    // `| 1U` keeps the leading-zero count defined for x == 0;
    // `& 0x1E` rounds the shift down to an even number.
    Sint lz = bitMSZerosU32(x | 1U) & 0x1E;
    x <<= lz; // normalise
    Uint32 y = 1u + (x >> 30); // small initial estimate from the top bits
    y = (y << 1) + (x >> 27) / y; // each step roughly doubles the precision
    y = (y << 3) + (x >> 21) / y;
    y = (y << 7) + (x >> 9) / y;
    y -= x < cast_val(Uint32, y) * y; // fix a possible off-by-one overshoot
    return cast_val(Uint16, (y >> (lz >> 1))); // undo normalisation (halved)
}
header_function
Uint32 bitSqrtU64 (Uint64 x)
{
    // Integer square root: floor(sqrt(x)) for 64-bit inputs; same
    // normalise-then-refine scheme as bitSqrtU32, with one extra step.
    // `| 1ull` keeps the leading-zero count defined for x == 0;
    // `& 0x3E` rounds the shift down to an even number.
    Uint8 lz = cast_val(Uint8, bitMSZerosU64(x | 1ull) & 0x3Eu);
    x <<= lz; // normalise
    Uint64 y = 2ull + (x >> 63); // small initial estimate from the top bits
    y = (y << 1) + (x >> 59) / y; // each step roughly doubles the precision
    y = (y << 3) + (x >> 53) / y;
    y = (y << 7) + (x >> 41) / y;
    y = (y << 15) + (x >> 17) / y;
    y -= x < cast_val(Uint64, y) * y; // fix a possible off-by-one overshoot
    return cast_val(Uint32, (y >> (lz >> 1))); // undo normalisation (halved)
}
// Type-generic wrappers over the power-of-two/log/sqrt helpers above:
// C11 dispatches via _Generic selection, C++ via function overloading.
#if defined(ENV_LANG_C) && ENV_LANG_C >= 2011
# define bitPow2(x) _Generic((x), Uint32: bitPow2U32, Uint64: bitPow2U64)(x)
# define bitLog2(x) _Generic((x), Uint32: bitLog2U32, Uint64: bitLog2U64)(x)
# define bitIsPowerOf2(x) _Generic((x), Uint32: bitIsPowerOf2U32, Uint64: bitIsPowerOf2U64)(x)
# define bitPrevPowerOf2(x) _Generic((x), Uint32: bitPrevPowerOf2U32, Uint64: bitPrevPowerOf2U64)(x)
# define bitNextPowerOf2(x) _Generic((x), Uint32: bitNextPowerOf2U32, Uint64: bitNextPowerOf2U64)(x)
# define bitSqrt(x) _Generic((x), Uint32: bitSqrtU32, Uint64: bitSqrtU64)(x)
#elif defined(ENV_LANG_CXX)
header_function Uint64 bitPow2 (Uint32 x) { return bitPow2U32(x); }
header_function Uint64 bitPow2 (Uint64 x) { return bitPow2U64(x); }
header_function Uint8 bitLog2 (Uint32 x) { return bitLog2U32(x); }
header_function Uint8 bitLog2 (Uint64 x) { return bitLog2U64(x); }
header_function Bool bitIsPowerOf2 (Uint32 x) { return bitIsPowerOf2U32(x); }
header_function Bool bitIsPowerOf2 (Uint64 x) { return bitIsPowerOf2U64(x); }
header_function Uint64 bitPrevPowerOf2 (Uint32 x) { return bitPrevPowerOf2U32(x); }
header_function Uint64 bitPrevPowerOf2 (Uint64 x) { return bitPrevPowerOf2U64(x); }
header_function Uint64 bitNextPowerOf2 (Uint32 x) { return bitNextPowerOf2U32(x); }
header_function Uint64 bitNextPowerOf2 (Uint64 x) { return bitNextPowerOf2U64(x); }
header_function Uint16 bitSqrt (Uint32 x) { return bitSqrtU32(x); }
header_function Uint32 bitSqrt (Uint64 x) { return bitSqrtU64(x); }
#endif
/*************************************************************************************
* Serialization
*/
header_function
void srlzWrite8 (Uint8 src, Byte *dest)
{
    // Single-byte store; endianness is irrelevant for one byte.
    *dest = src;
}
#define srlzWrite8BE srlzWrite8
#define srlzWrite8LE srlzWrite8
header_function
void srlzWrite16LE (Uint16 src, Byte *dest)
{
    // Store a 16-bit value as little-endian bytes (LSB first).
    dest[0] = cast_val(Uint8, src & 0xFFu);
    dest[1] = cast_val(Uint8, (src >> 8) & 0xFFu);
}
header_function
void srlzWrite16BE (Uint16 src, Byte *dest)
{
    // Store a 16-bit value as big-endian bytes (MSB first).
    dest[0] = cast_val(Uint8, (src >> 8) & 0xFFu);
    dest[1] = cast_val(Uint8, src & 0xFFu);
}
header_function
void srlzWrite32LE (Uint32 src, Byte *dest)
{
    // Store a 32-bit value as little-endian bytes (LSB first).
    for (Size i = 0; i < 4; i++) {
        dest[i] = cast_val(Uint8, (src >> (8 * i)) & 0xFFu);
    }
}
header_function
void srlzWrite32BE (Uint32 src, Byte *dest)
{
    // Store a 32-bit value as big-endian bytes (MSB first).
    for (Size i = 0; i < 4; i++) {
        dest[i] = cast_val(Uint8, (src >> (8 * (3 - i))) & 0xFFu);
    }
}
header_function
void srlzWrite64LE (Uint64 src, Byte *dest)
{
    // Store a 64-bit value as little-endian bytes (LSB first).
    for (Size i = 0; i < 8; i++) {
        dest[i] = cast_val(Uint8, (src >> (8 * i)) & 0xFFu);
    }
}
header_function
void srlzWrite64BE (Uint64 src, Byte *dest)
{
    // Store a 64-bit value as big-endian bytes (MSB first).
    for (Size i = 0; i < 8; i++) {
        dest[i] = cast_val(Uint8, (src >> (8 * (7 - i))) & 0xFFu);
    }
}
header_function
Uint8 srlzRead8 (const Byte *src)
{
    // Single-byte load; endianness is irrelevant for one byte.
    return cast_val(Uint8, src[0]);
}
#define srlzRead8BE srlzRead8
#define srlzRead8LE srlzRead8
header_function
Uint16 srlzRead16LE (const Byte *src) {
    // Assemble a 16-bit value from little-endian bytes (LSB first).
    Uint32 lo = cast_val(Uint32, 255 & src[0]);
    Uint32 hi = cast_val(Uint32, 255 & src[1]);
    return cast_val(Uint16, (hi << 8) | lo);
}
header_function
Uint16 srlzRead16BE (const Byte *src) {
    // Assemble a 16-bit value from big-endian bytes (MSB first).
    Uint32 hi = cast_val(Uint32, 255 & src[0]);
    Uint32 lo = cast_val(Uint32, 255 & src[1]);
    return cast_val(Uint16, (hi << 8) | lo);
}
header_function
Uint32 srlzRead32LE (const Byte *src)
{
    // Assemble a 32-bit value from little-endian bytes (LSB first).
    Uint32 result = 0;
    for (Size i = 0; i < 4; i++) {
        result |= cast_val(Uint32, 255 & src[i]) << (8 * i);
    }
    return result;
}
header_function
Uint32 srlzRead32BE (const Byte *src)
{
    // Assemble a 32-bit value from big-endian bytes (MSB first).
    Uint32 result = 0;
    for (Size i = 0; i < 4; i++) {
        result |= cast_val(Uint32, 255 & src[i]) << (8 * (3 - i));
    }
    return result;
}
header_function
Uint64 srlzRead64LE (const Byte *src)
{
    // Assemble a 64-bit value from little-endian bytes (LSB first).
    Uint64 result = 0;
    for (Size i = 0; i < 8; i++) {
        result |= cast_val(Uint64, 255 & src[i]) << (8 * i);
    }
    return result;
}
header_function
Uint64 srlzRead64BE (const Byte *src)
{
    // Assemble a 64-bit value from big-endian bytes (MSB first).
    Uint64 result = 0;
    for (Size i = 0; i < 8; i++) {
        result |= cast_val(Uint64, 255 & src[i]) << (8 * (7 - i));
    }
    return result;
}
// Punning unions used by the marshalling helpers below to move signed and
// floating-point values through the unsigned serializers without casts.
typedef union srlz_8 { Uint8 u8; Sint8 s8; } srlz_8;
typedef union srlz_16 { Uint16 u16; Sint16 s16; } srlz_16;
typedef union srlz_32 { Uint32 u32; Sint32 s32; Float32 f32; } srlz_32;
typedef union srlz_64 { Uint64 u64; Sint64 s64; Float64 f64; } srlz_64;
/*************************************************************************************
* Marshalling
*/
#define MARSHAL_FUNC(_name) Bool _name (void* data, Size size, void* userdata)
header_function
Bool marshalUint8 (Uint8 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Uint8 through `func`; byte order is meaningless for a
    // single byte, so big_endian is ignored. Returns false if `func` fails.
    unused_variable(big_endian);
    Byte buf[sizeof(*datum)] = {0};
    srlz_8 srlzd = {0};
    if (!write) { // read path: fetch the byte, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u8 = srlzRead8(buf);
        *datum = srlzd.u8;
        return true;
    }
    // write path: encode, then hand the byte to func
    srlzd.u8 = *datum;
    srlzWrite8(srlzd.u8, buf);
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalSint8 (Sint8 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Sint8 through `func` via the srlz_8 punning union;
    // byte order is meaningless for a single byte, so big_endian is ignored.
    unused_variable(big_endian);
    Byte buf[sizeof(*datum)] = {0};
    srlz_8 srlzd = {0};
    if (!write) { // read path: fetch the byte, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u8 = srlzRead8(buf);
        *datum = srlzd.s8;
        return true;
    }
    // write path: encode, then hand the byte to func
    srlzd.s8 = *datum;
    srlzWrite8(srlzd.u8, buf);
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalUint16 (Uint16 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Uint16 through `func` with the requested byte order;
    // returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_16 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u16 = big_endian ? srlzRead16BE(buf) : srlzRead16LE(buf);
        *datum = srlzd.u16;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.u16 = *datum;
    if (big_endian) { srlzWrite16BE(srlzd.u16, buf); }
    else            { srlzWrite16LE(srlzd.u16, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalSint16 (Sint16 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Sint16 through `func` via the srlz_16 punning union,
    // with the requested byte order; returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_16 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u16 = big_endian ? srlzRead16BE(buf) : srlzRead16LE(buf);
        *datum = srlzd.s16;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.s16 = *datum;
    if (big_endian) { srlzWrite16BE(srlzd.u16, buf); }
    else            { srlzWrite16LE(srlzd.u16, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalUint32 (Uint32 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Uint32 through `func` with the requested byte order;
    // returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_32 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u32 = big_endian ? srlzRead32BE(buf) : srlzRead32LE(buf);
        *datum = srlzd.u32;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.u32 = *datum;
    if (big_endian) { srlzWrite32BE(srlzd.u32, buf); }
    else            { srlzWrite32LE(srlzd.u32, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalSint32 (Sint32 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Sint32 through `func` via the srlz_32 punning union,
    // with the requested byte order; returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_32 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u32 = big_endian ? srlzRead32BE(buf) : srlzRead32LE(buf);
        *datum = srlzd.s32;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.s32 = *datum;
    if (big_endian) { srlzWrite32BE(srlzd.u32, buf); }
    else            { srlzWrite32LE(srlzd.u32, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalFloat32 (Float32 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Float32 through `func` by punning its bits through the
    // srlz_32 union, with the requested byte order; returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_32 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u32 = big_endian ? srlzRead32BE(buf) : srlzRead32LE(buf);
        *datum = srlzd.f32;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.f32 = *datum;
    if (big_endian) { srlzWrite32BE(srlzd.u32, buf); }
    else            { srlzWrite32LE(srlzd.u32, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalUint64 (Uint64 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Uint64 through `func` with the requested byte order;
    // returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_64 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u64 = big_endian ? srlzRead64BE(buf) : srlzRead64LE(buf);
        *datum = srlzd.u64;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.u64 = *datum;
    if (big_endian) { srlzWrite64BE(srlzd.u64, buf); }
    else            { srlzWrite64LE(srlzd.u64, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalSint64 (Sint64 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Sint64 through `func` via the srlz_64 punning union,
    // with the requested byte order; returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_64 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u64 = big_endian ? srlzRead64BE(buf) : srlzRead64LE(buf);
        *datum = srlzd.s64;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.s64 = *datum;
    if (big_endian) { srlzWrite64BE(srlzd.u64, buf); }
    else            { srlzWrite64LE(srlzd.u64, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
header_function
Bool marshalFloat64 (Float64 *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    // Round-trips a Float64 through `func` by punning its bits through the
    // srlz_64 union, with the requested byte order; returns false if `func` fails.
    Byte buf[sizeof(*datum)] = {0};
    srlz_64 srlzd = {0};
    if (!write) { // read path: fetch bytes, then decode
        if (!func(buf, sizeof(*datum), funcdata)) return false;
        srlzd.u64 = big_endian ? srlzRead64BE(buf) : srlzRead64LE(buf);
        *datum = srlzd.f64;
        return true;
    }
    // write path: encode, then hand the bytes to func
    srlzd.f64 = *datum;
    if (big_endian) { srlzWrite64BE(srlzd.u64, buf); }
    else            { srlzWrite64LE(srlzd.u64, buf); }
    return func(buf, sizeof(*datum), funcdata);
}
/*************************************************************************************
* Free Grid
*/
/*
* row mask |‾‾‾‾‾‾‾‾‾‾side‾‾‾‾‾‾‾‾‾|
* lsb
* 1 FF FF FF FF FF ‾|
* 1 FF FF FF FF FF |
* 1 FF FF 07 00 00 side
* 0 00 00 00 00 00 |
* 0 00 00 00 00 00 _|
* msb
* 1 1 1 1 1 <- col mask
* lsb msb
*/
// NOTE(naman): Here, the "correct" thing to do would have been to
// calculate the `side` variable using `sqrt(elem_count)`. But even
// if we take the maximum possible size, the allocation size is
// 32 KiB which is around the minimum allocation we can make with
// the root TLSF allocator. So, there is no point in making a smaller
// allocation here.
#define FG_SIDE 64u
// NOTE(naman): 2^18 is 64*64*64 (= 262144), and that is the maximum count we support
#define FREE_GRID_MAXIMUM_ELEMENTS (1U << 18)
typedef struct Free_Grid {
Uint64 masks[FG_SIDE * FG_SIDE * sizeof(Uint64)];
Uint8 count_masks_in_row[FG_SIDE];
Uint8 count_masks_in_col[FG_SIDE];
Uint64 row_mask;
Uint64 col_mask;
Uint32 count;
Uint32 allocated;
} Free_Grid;
header_function
void fgCreate (Free_Grid *fg, Uint32 elem_count)
{
    // Initialises the grid with `elem_count` free slots: the first elem_count
    // bits (LSB-first across consecutive masks) are set to 1, meaning "free".
    debugAssert(elem_count > 0); // elem_count == 0 would shift row_mask by 64 (UB)
    debugAssert(elem_count <= FREE_GRID_MAXIMUM_ELEMENTS);
    memzero(fg);
    fg->count = elem_count;
    Uint32 fully_marked_mask_count = elem_count / 64;
    Uint32 extra_marked_bits_count = elem_count % 64;
    if (fully_marked_mask_count) memset(fg->masks, 0xFF, fully_marked_mask_count * sizeof(Uint64));
    // BUGFIX: when elem_count is a multiple of 64, the old unconditional OR
    // computed `~0 >> (64 - 0)` (shift by 64 is undefined behaviour) and, at
    // the maximum element count, wrote one mask past the in-use range.
    // Only touch the partial mask when it actually exists.
    if (extra_marked_bits_count) {
        fg->masks[fully_marked_mask_count] |= ((~cast_val(Uint64, 0x0ull)) >> (64 - extra_marked_bits_count));
    }
    Uint32 total_marked_masks = fully_marked_mask_count + (extra_marked_bits_count ? 1 : 0);
    Uint32 fully_filled_rows = total_marked_masks / FG_SIDE;
    Uint8 partially_filled_row_size = total_marked_masks % FG_SIDE;
    Uint32 total_rows = fully_filled_rows + (partially_filled_row_size ? 1 : 0);
    // total_rows >= 1 (elem_count > 0), so these shifts stay in [0, 63].
    fg->row_mask = ((~cast_val(Uint64, 0x0ull)) >> (64 - total_rows));
    Uint32 col_bits_count = fully_filled_rows ? FG_SIDE : partially_filled_row_size;
    fg->col_mask = ((~cast_val(Uint64, 0x0ull)) >> (64 - minimum(FG_SIDE, col_bits_count)));
    // Per-row non-empty-mask counts: full rows hold FG_SIDE masks each, and
    // the trailing partial row (if any) holds the remainder.
    for (Size i = 0; i < fully_filled_rows; i++) {
        fg->count_masks_in_row[i] = FG_SIDE;
    }
    if (partially_filled_row_size) {
        pragma_msvc("warning ( push )");
        pragma_msvc("warning ( disable: 6386 )"); // Analyze: WRITE_OVERRUN (Buffer overrun while writing to 'fg->count_masks_in_row')
        fg->count_masks_in_row[fully_filled_rows] = partially_filled_row_size;
        pragma_msvc("warning ( pop )");
    }
    // Per-column counts: every column sees one mask per full row, plus one
    // more if the partial row extends into that column.
    for (Size i = 0; i < FG_SIDE; i++) {
        fg->count_masks_in_col[i] = cast_val(Uint8, fully_filled_rows);
        if (partially_filled_row_size && (i < partially_filled_row_size)) {
            fg->count_masks_in_col[i]++;
        }
    }
    return;
}
header_function
void fgDelete (Free_Grid *fg)
{
    // The grid is a plain in-place structure and owns no external resources,
    // so deletion is a no-op; kept for API symmetry with fgCreate.
    unused_variable(fg);
}
header_function
Bool fgAllocate (Free_Grid *fg, Uint32 *index)
{
    // Finds a free element (bit set to 1), marks it allocated (clears the
    // bit), and stores its element index in *index. Returns false when the
    // grid has no free element left.
    // NOTE(review): this relies on bitLSBU64 returning (Uint8)-1 for a zero
    // argument -- confirm against its definition.
    Uint8 row_first_bit_index = bitLSBU64(fg->row_mask);
    if (row_first_bit_index == cast_val(Uint8, -1)) {
        // No row contains a non-empty mask: the grid is fully allocated.
        return false;
    }
    debugAssert(row_first_bit_index < 64);
    // col_mask marks columns that are non-empty in *some* row, not necessarily
    // the row we just picked, so scan forward to the first mask in this row
    // that still has free bits. The scan terminates because the row's bit in
    // row_mask guarantees at least one non-empty mask in the row.
    Uint8 col_first_bit_index = bitLSBU64(fg->col_mask);
    debugAssert(col_first_bit_index != cast_val(Uint8, -1));
    while (!fg->masks[(row_first_bit_index * FG_SIDE) + col_first_bit_index]) {
        col_first_bit_index++;
    }
    debugAssert(col_first_bit_index < 64);
    Uint32 mask_array_index = row_first_bit_index * FG_SIDE + col_first_bit_index;
    debugAssert(mask_array_index < (64*64));
    debugAssert(fg->masks[mask_array_index]);
    // The lowest free bit inside the chosen mask is the allocated element.
    Uint8 mask_first_bit_index = bitLSBU64(fg->masks[mask_array_index]);
    debugAssert(mask_first_bit_index != cast_val(Uint8, -1));
    *index = mask_array_index * 64 + cast_val(Uint32, mask_first_bit_index);
    debugAssert(*index < fg->count);
    // Clear the bit (0 = allocated); if the mask just became empty, update the
    // row/column bookkeeping so future searches skip it.
    fg->masks[mask_array_index] &= ~(1ULL << mask_first_bit_index);
    if (fg->masks[mask_array_index] == 0) {
        fg->count_masks_in_row[row_first_bit_index]--;
        fg->count_masks_in_col[col_first_bit_index]--;
        if (fg->count_masks_in_row[row_first_bit_index] == 0) {
            fg->row_mask &= ~(1ULL << row_first_bit_index);
        }
        if (fg->count_masks_in_col[col_first_bit_index] == 0) {
            fg->col_mask &= ~(1ULL << col_first_bit_index);
        }
    }
    fg->allocated++;
    return true;
}
header_function
void fgFree (Free_Grid *fg, Uint32 index)
{
    // Returns element `index` to the grid: sets its bit back to 1 (= free)
    // and, if the containing mask was previously empty, re-publishes it in
    // the row/column bookkeeping so fgAllocate can find it again.
    debugAssert(index < fg->count);
    // Locate the mask that holds this bit. The (index + 1) / and % dance
    // yields the mask index and bit position without a special case for the
    // last bit of a mask: extra_untreaded_bits_count == 0 means "bit 63".
    Uint32 fully_traversed_mask_index = (index + 1) / 64;
    Uint32 extra_untreaded_bits_count = (index + 1) % 64;
    Uint32 total_relevant_mask_count = fully_traversed_mask_index + (extra_untreaded_bits_count ? 1 : 0);
    Uint32 final_relevant_mask_index = total_relevant_mask_count - 1;
    Uint32 row_bit_index = final_relevant_mask_index / FG_SIDE;
    Uint32 col_bit_index = final_relevant_mask_index - (row_bit_index * FG_SIDE);
    Uint32 relevant_bit_index;
    if (extra_untreaded_bits_count) {
        relevant_bit_index = extra_untreaded_bits_count - 1;
    } else {
        relevant_bit_index = 63;
    }
    // Double-free check: the bit must currently be 0 (allocated).
    debugAssert((fg->masks[final_relevant_mask_index] & (1ULL << relevant_bit_index)) == 0);
    Uint64 old_mask = fg->masks[final_relevant_mask_index];
    fg->masks[final_relevant_mask_index] |= (1ULL << relevant_bit_index);
    if (old_mask == 0) {
        // Mask transitioned empty -> non-empty: bump the per-row/column
        // counts and set the summary bits.
        fg->count_masks_in_row[row_bit_index]++;
        fg->count_masks_in_col[col_bit_index]++;
        fg->row_mask |= (1ULL << row_bit_index);
        fg->col_mask |= (1ULL << col_bit_index);
    }
}
#undef FG_SIDE // BUGFIX: was `#undef FG_SIZE` (typo) -- a no-op that leaked the FG_SIDE macro
/**************************************************************************************
* Printing
*/
// Signatures for user-supplied print sinks: PRINT_FUNC is the variadic entry
// point, PRINT_CALLBACK_FUNC is invoked each time the working buffer fills.
#define PRINT_FUNC(name) Size name (void *userdata, const Char *fmt, ...)
#define PRINT_CALLBACK_FUNC(name) void name (void *userdata, Char *filled_buffer, Size buffer_len, Size buffer_cap)
// Internal state flags accumulated while parsing one % conversion below.
typedef enum Print_Flags {
    Print_Flags_LEFT_JUSTIFIED = 1u << 0,  // '-' flag
    Print_Flags_ALTERNATE_FORM = 1u << 1,  // '#' flag
    Print_Flags_LEADING_PLUS = 1u << 2,    // '+' flag
    Print_Flags_LEADING_SPACE = 1u << 3,   // ' ' flag
    Print_Flags_LEADING_ZERO = 1u << 4,    // '0' flag
    Print_Flags_INT8 = 1u << 5,            // "hh" length modifier
    Print_Flags_INT16 = 1u << 6,           // "h" length modifier
    Print_Flags_INT64 = 1u << 7,           // "l"/"ll" (and 64-bit 'z'/'t')
    Print_Flags_NEGATIVE = 1u << 8,        // value being printed is negative
    Print_Flags_FLOAT_FIXED = 1u << 9,     // %f / %F
    Print_Flags_FLOAT_EXP = 1u << 10,      // %e / %E
    Print_Flags_FLOAT_HEX = 1u << 11,      // %a / %A
} Print_Flags;
// FIXME(naman): This needs to change to either yield or call a callback every n characters,
// so that functions using this don't have to call it twice (once to get size, then to
// get the actual characters) in cases when the buffer is too small the first time.
// FIXME(naman): Replace memcpy, etc. with loops so that at least some of the characters
// can get copied if the buffer is too small. Once this is done, uncomment sbPrintSized()
// TODO(naman): Add the following features:
// * Comma in numbers
// + Indian style (`)
// + Western style (')
// * Unit suffix
// + SI (1 K = 1000) (:)
// + IEC (1 K = 1024) (;)
header_function
Size printStringVarArg (Char *buffer, Size buffer_size,
Char const *format, va_list va,
PRINT_CALLBACK_FUNC((*callback)), void *userdata)
{
debugAssert(buffer);
debugAssert(buffer_size);
debugAssert(format);
debugAssert(callback);
Char const *fmt = format;
Size buffer_index = 0;
Size total_size = 0;
#define PRINT_ADD_CHAR(_c) do { \
if (buffer_index == buffer_size) { \
callback(userdata, buffer, buffer_index, buffer_size); \
buffer_index = 0; \
} \
buffer[buffer_index++] = _c; \
total_size++; \
} while (0)
while (true) {
// Copy everything up to the next % (or end of string)
while ((fmt[0] != '%') && (fmt[0] != '\0')) {
PRINT_ADD_CHAR(fmt[0]);
fmt++;
}
if (fmt[0] == '%') {
Char const *format_pointer = fmt;
fmt++;
// read the modifiers first
Uint32 flags = 0;
// flags
while (true) {
switch (fmt[0]) {
case '-': { // if we have left justify
flags |= Print_Flags_LEFT_JUSTIFIED;
fmt++;
continue;
} break;
case '#': { // if we use alternate form
flags |= Print_Flags_ALTERNATE_FORM;
fmt++;
continue;
} break;
case '+': { // if we have leading plus
flags |= Print_Flags_LEADING_PLUS;
fmt++;
continue;
} break;
case ' ': { // if we have leading space
flags |= Print_Flags_LEADING_SPACE;
fmt++;
continue;
} break;
case '0': { // if we have leading zero
flags |= Print_Flags_LEADING_ZERO;
fmt++;
goto flags_done;
} break;
default: {
goto flags_done;
} break;
}
}
flags_done:
Sint minimum_width = 0; // called field width in documentation
// get the field width
if (fmt[0] == '*') {
minimum_width = va_arg(va, Sint);
fmt++;
} else {
while ((fmt[0] >= '0') && (fmt[0] <= '9')) {
minimum_width = (minimum_width * 10) + (fmt[0] - '0');
fmt++;
}
}
Sint precision = -1;
// get the precision
if (fmt[0] == '.') {
fmt++;
if (fmt[0] == '*') {
precision = va_arg(va, Sint);
fmt++;
} else {
precision = 0;
if (fmt[0] == '-') { // Negative precision is treated as if precision was omitted
fmt++;
precision = -1;
while ((fmt[0] >= '0') && (fmt[0] <= '9')) {
fmt++;
}
} else {
while ((fmt[0] >= '0') && (fmt[0] <='9')) {
precision = (precision * 10) + (fmt[0] - '0');
fmt++;
}
}
}
}
// handle integer size overrides
switch (fmt[0]) {
case 'h': { // 'h' = 16-bit (a second 'h', handled below, narrows to 8-bit)
flags |= Print_Flags_INT16;
fmt++;
if (fmt[0] == 'h') {
flags &= ~cast_val(Uint32, Print_Flags_INT16);
flags |= Print_Flags_INT8;
fmt++;
}
} break;
case 'l': { // are we 64-bit
flags |= Print_Flags_INT64;
fmt++;
if (fmt[0] == 'l') fmt++;
} break;
case 'z': { // are we 64-bit on size_t?
flags |= (sizeof(Size) == 8) ? Print_Flags_INT64 : 0;
fmt++;
} break;
case 't': { // are we 64-bit on ptrdiff_t?
flags |= (sizeof(Dptr) == 8) ? Print_Flags_INT64 : 0;
fmt++;
} break;
default: {
} break;
}
# define PRINT_STR_SIZE 2048ULL
Char num_str[PRINT_STR_SIZE];
Char *num_str_ptr = num_str;
Char *str = nullptr;
Char head_str[8] = {0};
Size head_index = 0;
Char tail_str[8] = {0};
Size tail_index = 0;
Size len = 0;
switch (fmt[0]) { // handle each replacement
case 's': { // string
// get the string
str = va_arg(va, Char*);
if (str == nullptr) {
str = cast_const(Char *, "null");
}
// By this point, str is most definitely not nullptr
len = strlen(str);
// clamp to precision
// Since precision inits at -1, if none was mentioned, this will not execute
if (len > cast_val(Size, precision)) {
len = cast_val(Size, precision);
}
precision = 0;
} break;
case 'c': { // char
// get the character
str = num_str_ptr + PRINT_STR_SIZE - 1;
*str = cast_val(Char, va_arg(va, Sint));
len = 1;
precision = 0;
} break;
case 'n': { // weird write-bytes specifier
if (flags & Print_Flags_INT64) {
Sint64 *p = va_arg(va, Sint64*);
*p = cast_val(Sint64, buffer_index);
} else if (flags & Print_Flags_INT16) {
Sint16 *p = va_arg(va, Sint16*);
*p = cast_val(Sint16, buffer_index);
} else if (flags & Print_Flags_INT8) {
Sint8 *p = va_arg(va, Sint8*);
*p = cast_val(Sint8, buffer_index);
} else {
Sint *p = va_arg(va, Sint*);
*p = cast_val(Sint, buffer_index);
}
} break;
case 'b': case 'B':
case 'o': case 'O' :
case 'x': case 'X': { // binary
enum { BIN, OCT, HEX } base;
Bool upper = false;
Char type = fmt[0];
switch (type) {
case 'b': base = BIN; break;
case 'B': base = BIN; upper = true; break;
case 'o': base = OCT; break;
case 'O': base = OCT; upper = true; break;
case 'x': base = HEX; break;
case 'X': base = HEX; upper = true; break;
default: {
base = BIN; // To silence uninitialized variable warning
debugAssert(false && "Print: Integer base unitialized");
} break;
}
Uint64 num;
if (flags & Print_Flags_INT64) {
num = va_arg(va, Uint64);
} else {
num = va_arg(va, Uint32);
}
Uint64 num_dec = num;
if (flags & Print_Flags_INT8) {
num_dec = cast_val(Uint8, num_dec);
} else if (flags & Print_Flags_INT16) {
num_dec = cast_val(Uint16, num_dec);
}
str = num_str_ptr + PRINT_STR_SIZE;
if ((num == 0) && (precision == 0)) {
break;
}
while (true) {
Uint64 and_mask = 0;
switch (base) {
case BIN: and_mask = 0x1ULL; break;
case OCT: and_mask = 0x7ULL; break;
case HEX: and_mask = 0xFULL; break;
default: debugBreak(); break;
}
Uint64 shift_magnitude = 0;
switch (base) {
case BIN: shift_magnitude = 1ULL; break;
case OCT: shift_magnitude = 3ULL; break;
case HEX: shift_magnitude = 4ULL; break;
default: debugBreak(); break;
}
Uint64 n = num_dec & and_mask;
num_dec = num_dec >> shift_magnitude;
str--;
switch (base) {
case BIN: {
*str = (n == 1) ? '1' : '0';
} break;
case OCT: {
switch (n) {
case 0: *str = '0'; break;
case 1: *str = '1'; break;
case 2: *str = '2'; break;
case 3: *str = '3'; break;
case 4: *str = '4'; break;
case 5: *str = '5'; break;
case 6: *str = '6'; break;
case 7: *str = '7'; break;
default: debugBreak(); break;
}
} break;
case HEX: {
switch (n) {
case 0x0: *str = '0'; break;
case 0x1: *str = '1'; break;
case 0x2: *str = '2'; break;
case 0x3: *str = '3'; break;
case 0x4: *str = '4'; break;
case 0x5: *str = '5'; break;
case 0x6: *str = '6'; break;
case 0x7: *str = '7'; break;
case 0x8: *str = '8'; break;
case 0x9: *str = '9'; break;
case 0xA: *str = upper ? 'A' : 'a'; break;
case 0xB: *str = upper ? 'B' : 'b'; break;
case 0xC: *str = upper ? 'C' : 'c'; break;
case 0xD: *str = upper ? 'D' : 'd'; break;
case 0xE: *str = upper ? 'E' : 'e'; break;
case 0xF: *str = upper ? 'F' : 'f'; break;
default: debugBreak(); break;
}
} break;
default: debugBreak(); break;
}
if ((num_dec != 0) || (((num_str_ptr + PRINT_STR_SIZE) - str) < precision)) {
continue;
} else {
break;
}
}
len = (cast_ptr(Uptr, num_str_ptr) + PRINT_STR_SIZE) - cast_ptr(Uptr, str);
Char head_char = 0;
switch (base) {
case BIN: head_char = upper ? 'B' : 'b'; break;
case OCT: head_char = upper ? 'O' : 'o'; break;
case HEX: head_char = upper ? 'X' : 'x'; break;
default: debugBreak(); break;
}
if (flags & Print_Flags_ALTERNATE_FORM) {
head_str[head_index++] = '0';
if (upper) {
head_str[head_index++] = head_char;
} else {
head_str[head_index++] = head_char;
}
}
if (precision < 0) {
precision = 0;
}
} break;
case 'u':
case 'd': { // integer
// get the integer and abs it
Uint64 num = 0;
if (flags & Print_Flags_INT64) {
Sint64 i64 = va_arg(va, Sint64);
if ((fmt[0] != 'u') && (i64 < 0)) {
num = 0ULL - cast_val(Uint64, i64);
flags |= Print_Flags_NEGATIVE;
} else {
num = cast_val(Uint64, i64);
}
} else {
Sint32 i = va_arg(va, Sint32);
if ((fmt[0] != 'u') && (i < 0)) {
flags |= Print_Flags_NEGATIVE;
num = cast_val(Uint64, 0U - cast_val(Uint32, i));
} else {
num = cast_val(Uint64, i);
}
}
// convert to string
Uint64 num_dec = num;
if (flags & Print_Flags_INT8) {
num_dec = cast_val(Uint8, num_dec);
} else if (flags & Print_Flags_INT16) {
num_dec = cast_val(Uint16, num_dec);
}
str = num_str_ptr + PRINT_STR_SIZE;
if ((num == 0) && (precision == 0)) {
break;
}
while (num_dec) {
str--;
*str = cast_val(Char, num_dec % 10) + '0';
num_dec /= 10;
}
// get the length that we copied
len = (cast_ptr(Uptr, num_str_ptr) + PRINT_STR_SIZE) - cast_ptr(Uptr, str);
if (len == 0) {
--str;
*str = '0';
len = 1;
}
if (flags & Print_Flags_NEGATIVE) {
head_str[head_index++] = '-';
}
if (flags & Print_Flags_LEADING_PLUS) {
head_str[head_index++] = '+';
}
if (flags & Print_Flags_LEADING_SPACE) {
if (!(flags & Print_Flags_NEGATIVE)) {
head_str[head_index++] = ' ';
}
}
if (flags & Print_Flags_LEADING_ZERO) {
head_str[head_index++] = '0';
}
if (precision < 0) {
precision = 0;
}
} break;
case 'p': { // pointer
const void *pv = va_arg(va, void*);
const Uptr pval = cast_ptr(Uptr, pv);
const Uint64 num = cast_val(Uint64, pval);
debugAssert(sizeof(pval) <= sizeof(num));
Uint64 num_dec = num;
str = num_str_ptr + PRINT_STR_SIZE;
while (true) {
Uint64 n = num_dec & 0xf;
num_dec = num_dec >> 4;
str--;
switch (n) {
case 0x0: *str = '0'; break;
case 0x1: *str = '1'; break;
case 0x2: *str = '2'; break;
case 0x3: *str = '3'; break;
case 0x4: *str = '4'; break;
case 0x5: *str = '5'; break;
case 0x6: *str = '6'; break;
case 0x7: *str = '7'; break;
case 0x8: *str = '8'; break;
case 0x9: *str = '9'; break;
case 0xA: *str = 'A'; break;
case 0xB: *str = 'B'; break;
case 0xC: *str = 'C'; break;
case 0xD: *str = 'D'; break;
case 0xE: *str = 'E'; break;
case 0xF: *str = 'F'; break;
default: debugBreak(); break;
}
if ((num_dec != 0) || (((num_str_ptr + PRINT_STR_SIZE) - str) < precision)) {
continue;
} else {
break;
}
}
len = (cast_ptr(Uptr, num_str_ptr) + PRINT_STR_SIZE) - cast_ptr(Uptr, str);
} break;
case 'f': case 'F':
case 'e': case 'E':
case 'a': case 'A': {
switch (fmt[0]) {
case 'f': case 'F': { flags |= Print_Flags_FLOAT_FIXED; } break;
case 'e': case 'E': { flags |= Print_Flags_FLOAT_EXP; } break;
case 'a': case 'A': { flags |= Print_Flags_FLOAT_HEX; } break;
default: break;
}
Bool capital = false;
switch (fmt[0]) {
case 'F': case 'E': case 'A': { capital = true; } break;
default: break;
}
Bool precision_provided = precision >= 0;
if (flags & Print_Flags_FLOAT_HEX) {
// Default precision is set to -1, so this means that if precision is not mentioned,
// we print 13 digits which is enough for a lossless roundtrip.
if (precision < 0) {
precision = 13;
} else if (precision > 17) {
precision = 13;
}
} else {
// Since decimal (non-hex) is for human reading, 6 is the right precision
// (considering that it is also default in libc).
if (precision < 0) {
precision = 6;
} else if (precision > 17) {
precision = 17;
}
}
Float64 f64 = va_arg(va, Float64);
union FU64 { Float64 f; Uint64 u;} fu64 = {.f = f64};
Uint64 bits = fu64.u;
/* NOTE(naman): 64-bit IEEE-754 Floating Point structure
____________________________________________
| Sign bit | Exponent bits | Mantissa bits|
| 1 | 11 | 52 |
--------------------------------------------
*/
#define PRINT_F64_MANTISSA_BITS 52
#define PRINT_F64_EXPONENT_BITS 11
#define PRINT_F64_BIAS 1023
// Is the top bit set?
Bool negative = bits >> (PRINT_F64_MANTISSA_BITS + PRINT_F64_EXPONENT_BITS);
// Remove all mantissa bits ------------------ and then reset the sign bit
Uint32 exponent_biased = ((cast_val(Uint32, bits >> PRINT_F64_MANTISSA_BITS)) &
((1U << PRINT_F64_EXPONENT_BITS) - 1U));
Sint32 exponent = cast_val(Sint32, exponent_biased) - PRINT_F64_BIAS;
// Reset all bits except for the mantissa bits
Uint64 mantissa = bits & ((1ULL << PRINT_F64_MANTISSA_BITS) - 1ULL);
if (negative) {
flags |= Print_Flags_NEGATIVE;
}
{ // Leading String
if (flags & Print_Flags_NEGATIVE) {
head_str[head_index++] = '-';
} else if (flags & Print_Flags_LEADING_PLUS) {
head_str[head_index++] = '+';
} else if (flags & Print_Flags_LEADING_SPACE) {
if (!(flags & Print_Flags_NEGATIVE)) {
head_str[head_index++] = ' ';
}
}
if ((flags & Print_Flags_LEADING_ZERO) &&
!(flags & Print_Flags_FLOAT_HEX)) {
head_str[head_index++] = '0';
}
}
// Step 1: Print if it is NaN/inf or zero
if (exponent_biased == 0x7FF) { // Handle NaN and Inf
str = num_str;
if (capital) {
if (mantissa) {
*str++ = 'N';
*str++ = 'A';
*str++ = 'N';
} else {
*str++ = 'I';
*str++ = 'N';
*str++ = 'F';
}
} else {
if (mantissa) {
*str++ = 'n';
*str++ = 'a';
*str++ = 'n';
} else {
*str++ = 'i';
*str++ = 'n';
*str++ = 'f';
}
}
} else if (exponent_biased == 0 && mantissa == 0) {
if (flags & Print_Flags_FLOAT_HEX) {
head_str[head_index++] = '0';
head_str[head_index++] = capital ? 'X' : 'x';
head_str[head_index++] = '0';
}
str = num_str;
*str++ = '0';
if (precision > 0) {
*str++ = '.';
memset(str, '0', cast_val(Size, precision));
str += precision;
}
if (flags & Print_Flags_FLOAT_EXP) {
tail_str[tail_index++] = capital ? 'E' : 'e';
tail_str[tail_index++] = '+';
tail_str[tail_index++] = '0';
tail_str[tail_index++] = '0';
} else if (flags & Print_Flags_FLOAT_HEX) {
tail_str[tail_index++] = capital ? 'P' : 'p';
tail_str[tail_index++] = '+';
tail_str[tail_index++] = '0';
}
}
// Normalized means that the float can always be represented as
// mantissa_normalised * 2^exponent_normalised
Uint64 mantissa_normalised;
Sint32 exponent_normalised;
if ((exponent_biased == 0) && (mantissa != 0)) { // Subnormal/Denormals
// Value of float is:
// (mantissa / 2^52) * 2^(1-1023)
// => mantissa * 2^(1 - 1023 - 52)
// => mantissa * 2^(-1074)
// Meaning, mantissa: mantissa ; exponent: -1074
mantissa_normalised = mantissa;
exponent_normalised = -1074;
} else {
// If the float is normal, the value is:
// (1 + (mantissa / 2^52)) * 2^(exponent)
// => (2^52 + mantissa) * 2^(exponent - 52)
// Meaning, mantissa: 2^52 + mantissa ; exponent: exponent - 52
mantissa_normalised = mantissa | (1ULL << 52);
exponent_normalised = exponent - 52;
}
// Step 2: Print if it a hexadecimal float
if ((str == nullptr) && (flags & Print_Flags_FLOAT_HEX)) {
head_str[head_index++] = '0';
head_str[head_index++] = capital ? 'X' : 'x';
Uint64 man = mantissa_normalised;
Sint32 exp = exponent_normalised + PRINT_F64_MANTISSA_BITS;
if (mantissa || precision) {
/* This makes it so that the MSB of mantissa is on
* 60th bit, and the normal/denormal bit (that we set above)
* is on 61st. This is done so that we can later extract the
* bit a nibble at a time by taking the top 4 bits repeatedly like this:
* (man >> 60) & 0xF; man <<= 4;
* This prevents having to do more complex masking.
*
* To find out how much to shift, we need to know how many nibbles
* need to be printed, and then shift so that that many nibbles end up
* in the front (when going from MSB to LSB). That calculation is:
* bitsInFloat64 - (bitsInNibble * (MANTISSA_BITS/bitsInNibble + 1))
* => 64 - (4 * (52/4 + 1))
* => 64 - (4 * (13 + 1))
* => 64 - (4 * 14)
* => 64 - 56
* => 8
* (1 at the end comes to handle the 53rd implicit bit that we just added above)
*/
man <<= 8;
/* This will do the rounding if the precision is smaller than maximum.
* It does this by incrementing the MSB of mantissa that won't be printed.
* Doing that will send a carry forward if the nibble associated with
* that MSB was >= 8; and not if it was < 7. Similarly, the carry will
* propagate further, rounding each nibble to its nearest value,
* with ties to even (i.e., round-to-nearest-even) which is considered default.
*/
if (precision < 13) {
/* Now that we have moved all the nibbles to the top with `man <<= 8`,
* the bit index of the first nibble that will NOT be printed
* is (59 - 4*precision).
*/
const Uint64 one_at_unprinted_msb_if_precision_is[13] = {
1ULL << (59 - 4 * 0), // 576460752303423488ULL, 0x800000000000000
1ULL << (59 - 4 * 1), // 36028797018963968ULL, 0x80000000000000
1ULL << (59 - 4 * 2), // 2251799813685248ULL, 0x8000000000000
1ULL << (59 - 4 * 3), // 140737488355328ULL, 0x800000000000
1ULL << (59 - 4 * 4), // 8796093022208ULL, 0x80000000000
1ULL << (59 - 4 * 5), // 549755813888ULL, 0x8000000000
1ULL << (59 - 4 * 6), // 34359738368ULL, 0x800000000
1ULL << (59 - 4 * 7), // 2147483648ULL, 0x80000000
1ULL << (59 - 4 * 8), // 134217728ULL, 0x8000000
1ULL << (59 - 4 * 9), // 8388608ULL, 0x800000
1ULL << (59 - 4 * 10), // 524288ULL, 0x80000
1ULL << (59 - 4 * 11), // 32768ULL, 0x8000
1ULL << (59 - 4 * 12), // 2048ULL, 0x800
};
// Various masks used for rounding
Uint64 one_at_unprinted_msb = one_at_unprinted_msb_if_precision_is[precision];
Uint64 ones_after_unprinted_msb = one_at_unprinted_msb - 1ULL;
Uint64 zeroes_at_unprinted = ~((ones_after_unprinted_msb << 1) | 1);
Uint64 one_at_printed_lsb = one_at_unprinted_msb << 1;
// https://medium.com/angular-in-depth/how-to-round-binary-fractions-625c8fa3a1af
// Truncate the mantissa so zero all bits that won't be printed due to precision
Uint64 lower = man & zeroes_at_unprinted;
// Set most significant unprinted bit (not zero) to one
Uint64 middle = lower + one_at_unprinted_msb;
// Add one bit at least significant printed bit (and deal with any carry forwards)
Uint64 upper = lower + one_at_printed_lsb;
if (man < middle) {
man = lower;
} else if (man > middle) {
man = upper;
} else { // man == middle
// Resolve tie to nearest even
if ((lower & one_at_printed_lsb) == 0) {
man = lower;
} else {
man = upper;
}
}
}
Char const *hexs = capital ? "0123456789ABCDEF" : "0123456789abcdef";
str = num_str;
// This prints 0/1 depending on normal/denormal status
*str++ = hexs[(man >> 60) & 0xF];
man <<= 4;
if (precision) *str++ = '.';
Sint32 n = precision;
Uint32 count_of_end_zeroes = 0;
while (n--) {
if ((man >> 60) & 0xF) {
count_of_end_zeroes = 0;
} else {
count_of_end_zeroes++;
}
*str++ = hexs[(man >> 60) & 0xF];
man <<= 4;
}
if (precision_provided) {
count_of_end_zeroes = 0;
}
str -= count_of_end_zeroes;
} else {
len = 0;
}
{
tail_str[tail_index++] = capital ? 'P' : 'p';
tail_str[tail_index++] = exp > 0 ? '+' : '-';
exp = exp > 0 ? exp : -exp;
Char es[4] = {0};
Char *ep = &es[elemin(es)];
Char el = 0;
while (exp) {
*--ep = cast_val(Char, exp % 10) + '0';
el++;
exp /= 10;
}
while (el) {
el--;
tail_str[tail_index++] = *ep++;
}
}
}
Bool rounding_up_added_a_new_integer_digit_on_the_left = true;
// If this is true, str will be set to num_str at the end, else to num_str+1.
// This is needed to deal with the carry digit when rounding due to less precision.
// Step 3: Print if it is the usual float to decimal situation
// FIXME(naman): This uses bigint arithmetic, which makes it quite slow (especially
// for doubles with huge integral parts). See about replacing it with
// the Swift's dtoa that uses a improved Grisu2/Errol3 algorithm:
// https://github.com/swiftlang/swift/blob/main/stdlib/public/runtime/SwiftDtoa.cpp
if (str == nullptr) { // If still unprinted, print through the long method
rounding_up_added_a_new_integer_digit_on_the_left = false;
str = num_str + 1;
// This algorithm is basically a fixed-point exact-conversion
// implementation (inspired by Tei Andu: https://pastebin.com/z9hYEWF1).
// Since this implementation is extremely simple, we don't try to find
// the minimum digits that allow round tripping, especially since
// `printf` like interfaces don't allow asking for it either.
// If you want round-tripping, print as a hex-float, or
// print with precision set to 17.
// If M = mantissa_normalised and X = exponent_normalised,
// then M should be interpreted as a number of shape 1.pqrs or 0.pqrs
// Multiplying with 2^X turns this into a real floating point value:
// v = M.2^X
// But X can be both positive and negative, which means that it could move
// the binary point in M right or left. And even if it is positive, it may not
// move the binary point enough for M to become an integer, which is what
// we want for easy processing.
// So, we multiply the whole value `v` with a huge number 2^p, where `p`
// is chosen so that it can overpower even the most negative X so that
// the whole number `v` becomes a single integer.
// N = v.2^p = M.2^X.2^p
// => N = v.2^p = M.2^(X+p)
// => N = v.2^p = M.2^L
// What this means is that we shift the mantissa M left by X+p (=L) bits to turn
// the whole number into a giant integer. Later, we can extract the integral
// and fractions parts (`I` and `F`) of the original value `v` using:
// I = floor(N/2^p) = N >> p
// F = N mod 2^p = N & ((1 << p) - 1)
// Now, L has to be greater than or equal to zero, since shifting right is stupid,
// even if the mantissa is already integral. So,
// L >= 0
// => X+p >= 0
// => p >= -X
// If `p` is greater than `-X`, we need highest value of `-X` which means we need
// lowest value of `X`. So let's find Xmin:
// First, let's say that the number of exponent bits is k.
// Then, bias is equal to 2^(k-1) - 1
// And since the values of exponent_biased can not be 0 and 111...1
// since these values are reserved for NaN and Inf, exponent_biased's range is
// [1, 2^k - 2].
// To remove bias, we subtract it, so:
// exponent = exponent_biased - bias
// => exponent = [1, 2^k - 2] - (2^(k-1) - 1)
// => exponent = [1, 2^k - 2] - 2^(k-1) + 1
// => exponent = [1 - 2^(k-1) + 1, 2.2^(k-1) - 2^(k-1) + 1]
// => exponent = [2 - 2^(k-1), 2^(k-1) - 1]
// For Float32, this is [-126, 127] and for Float64, this is [-1022, 1023]
// Since X = exponent - mantissa_bits,
// X for Float32 is [-126-23, 127-23] => [-149, 104]
// X for Float64 is [-1022-52, 1023-52] => [-1074, 971]
// Therefore, coming to the inequality,
// p >= -X
// => p >= -(-1074)
// => p >= 1074
// Similarly to find out what is the maximum possible number of bits in `N`, we need
// to add the already existing bits in mantissa and how much we shifted them by. Thus,
// h = L + mantissa_bits
// => h = X + p + mantissa_bits
// => h = (exponent - mantissa_bits) + p + mantissa_bits
// => h = exponent + p
// Let's say that the big int that will store the shifted mantissa has B bits.
// Accounting for one carry bit, this means that,
// h <= B - 1
// => exponent + p <= B - 1
// => p <= B - 1 - exponent
// For inequality to hold forever, (B - 1 - exponent) has to be minimum, meaning
// that `exponent` has to be maximum.
// => p <= B - 1 - 1023
// => p <= B - 1024
// Combining the two inequalities about `p`, we get
// 1074 <= B - 1024
// => B >= 2098
// Since the big int B will be composed of Uint64s, we need to find out how many
// Uint64s it will take. 2098/(8*8) is 32.78, and the closest larger number that
// can be easily cleanly divided in two in 34. However, the Ryan Juckett algorithm used
// bigints with 35 Uint32s, so maybe its safer to use 36 Uint64s, meaning:
// B = 2304
Uint64 bigint[36] = {0};
// Once we set B, we can derive p by:
// p <= 2304 - 1024
// => p <= 1280
// and p >= 1074
// The ideal choice is p = 1152, since it is B/2 (easy to deal with in various ways).
Uint pivot = 1152; // p above, 160 in reference
Sint total_left_shift_signed = exponent_normalised + cast_val(Sint, pivot);
debugAssert(total_left_shift_signed >= 0);
Uint total_left_shift = cast_val(Uint, total_left_shift_signed);
// `L` above, `mlsft` in reference
// The casting is safe since pivot is chosen to be big enough that it makes
// even fully negative X become positive after addition.
// Because pivot is divided by 64 evenly (1152/64=18), the pivot bit will fall
// on an element boundary. This means that in the total_left_shift, the shift
// contributed by pivot always ends at a element boundary, and then the shift by
// exponent_normalised (whether positive or negative) moves the mantissa bits
// left or right, potentially out of alignment with that element boundary.
Uint pivot_elem = pivot/64;
// Since we are using Uint64s for bigint, we need to calculate which elements
// of the array do the bits of mantissa get stored in first, and which bits get
// stored where.
// N = M.2^L
// => N = M.2^(64*(L/64) + (L%64))
// If elems_shifted = L/64, and bits_shifted = L%64
// => N = M.2^(bits_shifted).2^(64*elems_shifted)
// This means we first shift by enough bits to move the mantissa into the right
// relative position between two elements. Then we shift across the as many word
// until the mantissa is in the right place.
// NOTE(naman): The LSB->MSB of our mantissa is stored from index 0->35
// So the best way to visualize the bigint array is:
// bigint = [ 35 | 34 | 33 | 32 | ... | 3 | 2 | 1 | 0 ]
// with the bits in each element going:
// element = [ 63 | 62 | 61 | ... | 3 | 2 | 1 | 0 ]
// In the rest of the comments below, I will refer to range within
// the array as [larger, smaller] indices, since that is what makes
// sense here.
Uint elems_shifted = total_left_shift >> 6; // L / 64, `a` in reference
Uint bits_shifted = total_left_shift & ((1u << 6) - 1u); // L % 64, `b` in reference
bigint[elems_shifted] = mantissa_normalised << bits_shifted; // Mantissa's LSB
if (bits_shifted) {
bigint[elems_shifted+1] = mantissa_normalised >> (64u - bits_shifted); // Mantissa's MSB
}
// The `I` (integer part) is in [35, pivot_elem] and the `F` (fractional part) is
// in [pivot_elem-1, 0].
Char *str_int_begin = str;
Char *str_int_end;
{ // Print integral part
// The basic algorithm is to divide by 10 continuously and collect the remainder.
// This remainder is the digit to be printed in the next place value. Of course,
// when we are dealing with bigint, we have to modify the algorithm like this:
// carry = 0
// for (&elem in integral_bigint from left to right) {
// current = carry.2^64 + *elem
// *elem = floor(current/10)
// carry = current % 10
// }
// As an example, lets say we have the number 5150, and we are dealing with
// two digits at a time. Diving by 10 will mean first doing 51/10=5 and getting
// carry of 1. Then, we multiply the carry 1 with 100, and add it to 50 to get 150,
// which is divided again as 150/10=15. Thus, the final quotient is 515.
// Now, doing the (carry.2^64 + *elem) will require more than 64-bits, which we don't
// have. Maybe we could do it in 128-bits, but there is an easier hack. What if we break
// this operation into two 32-bit ops? That way, we can get it done directly without
// needing any new helper functions or data types.
// Let start with:
// current = carry.2^64 + *elem
// Divide *elem in two 32-bit parts, `hi` and `lo`
// *elem = hi.2^32 + lo
// Let's say:
// cur_hi = carry.2^32 + hi
// and cur_hi = 10.q_hi + r_hi // Quotient and Remainder
// Similarly,
// cur_lo = r_hi.2^32 + lo
// and cur_lo = 10.q_lo + r_lo
// Combining:
// current = carry.2^64 + *elem
// => current = carry.2^64 + hi.2^32 + lo
// => current = 2^32.(carry.2^32 + hi) + lo
// => current = 2^32.cur_hi + lo
// => current = 2^32.(10.q_hi + r_hi) + lo
// => current = 2^32.10.q_hi + 2^32.r_hi + lo
// => current = (q_hi << 32).10 + cur_lo
// => current = (q_hi << 32).10 + 10.q_lo + r_lo
// => current = 10.((q_hi << 32) + q_lo) + r_lo
// And so, we can calculate:
// floor(current/10) = (q_hi << 32) +q_lo
// and current % 10 = r_lo
Bool non_zero_digits_left = false;
Sint non_zero_starts_at = cast_val(Sint, elems_shifted + 1);
str_int_begin = str;
do {
Uint32 carry = 0;
non_zero_digits_left = false;
for (Sint i = non_zero_starts_at; i >= cast_val(Sint, pivot_elem); i--) {
Uint64 elem = bigint[i];
Uint32 hi = cast_val(Uint32, elem >> 32);
Uint32 lo = cast_val(Uint32, elem);
Uint64 cur_hi = (cast_val(Uint64, carry) << 32) + hi;
Uint32 q_hi = cast_val(Uint32, cur_hi / 10u);
Uint32 r_hi = cast_val(Uint32, cur_hi % 10u);
Uint64 cur_lo = (cast_val(Uint64, r_hi) << 32) + lo;
Uint32 q_lo = cast_val(Uint32, cur_lo / 10u);
Uint32 r_lo = cast_val(Uint32, cur_lo % 10u);
bigint[i] = (cast_val(Uint64, q_hi) << 32) + q_lo;
carry = r_lo;
Bool is_zero = bigint[i] == 0;
if ((i == non_zero_starts_at) && is_zero) {
non_zero_starts_at--;
}
non_zero_digits_left |= !is_zero;
}
debugAssert(carry < 10);
*str++ = cast_val(Char, carry) + '0';
} while (non_zero_digits_left);
str_int_end = str - 1;
// Reverse the printed string
Char *begin = str_int_begin;
Char *end = str_int_end;
while (end > begin) {
Char swap = *end;
*end-- = *begin;
*begin++ = swap;
}
}
*str++ = '.';
Char *str_frc_begin = str;
Char *str_frc_end;
{ // Print Fractional part
// When I say that `F` is the fractional part, what I mean is that the
// `F` is an integer that will make the fractional part if the number was
// written in the format `I.F` (where the dot is the decimal point, not
// multiplication. But as it is, if we simply were to read the bits in
// [pivot_elem-1, 0], we just get an integer F that represents
// the fractional part. So, to actually get the digits out of it,
// we will have to figure out how to treat the integer F as a fraction.
// Also, remember that we have a total `pivot` number of bits in F,
// meaning that the decimal point (after multiplying 2^p originally when
// making the whole floating point an integer) is on the pivot point.
// So, the way to get the fractional part is to read only what comes after
// the pivot.
// To calculate the digit:
// d = floor((10 * F)/2^p)
// What this means is that we first multiply F by 10 (thus moving one digit to
// the left of the decimal point), and then right-shift by p-bits so that all
// the numbers right of the decimal point disappear, leaving only the one digit
// that was to the left of the point.
// Once this is done, before we can do the next digit, we change F with:
// F = (10 * F) % 2^p
// Basically, this gives us the bits right of the decimal point after multiplying
// with 10.
// Since we are dealing with bigint, the multiplication algorithm has to be:
// carry = 0
// while (&elem in fractional_bigint from right to left) {
// product = *elem * 10
// *elem = carry + (product % 2^64)
// carry = product / 2^64
// }
// As an example, lets say we have the number 0.5432, and we are dealing with
// two digits at a time. Multiplying with 10 will mean first doing 32*10=320
// and getting result of 20 and carry of 3. Then, we multiply 54 with 10
// to get 540 to which we add the carry 3, getting 543. Again, 43 remain and 5
// becomes the carry. This 5 is the digit extracted, since the number now is
// 5.432, with 5 stored in carry and 4320 being left for future digit extraction.
// Similar to the integral part, doing the (*elem * 10) and (carry + product)
// will require more than 64-bits, which we don't have. So, let's try to do this
// with 32 bit ops again.
// Let's start with:
// product = *elem * 10
// Divide *elem in two 32-bit parts, `hi` and `lo`
// *elem = hi.2^32 + lo
// Let's say:
// prod_lo = lo * 10
// prod_lo_mod = prod_lo % 2^32
// elem_lo = carry + prod_lo_mod
// carry_lo = prod_lo / 2^32
//
// prod_hi = hi * 10
// elem_hi = carry_lo + (prod_hi % 2^32)
// carry_hi = prod_hi / 2^32
//
// *elem = elem_hi.2^32 + elem_lo
// carry = carry_hi
Bool non_zero_digits_left = false;
Uint fraction_starts_at = elems_shifted;
Uint digit_count = 0;
// This loop will always print atleast one digit
do {
Uint32 carry = 0;
non_zero_digits_left = false;
for (Uint32 i = fraction_starts_at; i < pivot_elem; i++) {
Uint64 elem = bigint[i];
Uint32 hi = cast_val(Uint32, elem >> 32);
Uint32 lo = cast_val(Uint32, elem);
Uint64 prod_lo = cast_val(Uint64, lo) * 10u;
Uint32 prod_lo_mod = cast_val(Uint32, prod_lo);
Uint32 elem_lo = carry + prod_lo_mod;
Uint32 carry_lo = cast_val(Uint32, prod_lo >> 32);
if (elem_lo < prod_lo_mod) { // Overflow when adding carry
carry_lo++;
}
Uint64 prod_hi = cast_val(Uint64, hi) * 10u;
Uint32 prod_hi_mod = cast_val(Uint32, prod_hi);
Uint32 elem_hi = carry_lo + prod_hi_mod;
Uint32 carry_hi = cast_val(Uint32, prod_hi >> 32);
if (elem_hi < prod_hi_mod) { // Overflow when adding carry
carry_hi++;
}
bigint[i] = (cast_val(Uint64, elem_hi) << 32) | elem_lo;
carry = carry_hi;
Bool is_zero = bigint[i] == 0;
if ((i == fraction_starts_at) && is_zero) {
fraction_starts_at++;
}
non_zero_digits_left |= !is_zero;
}
if (digit_count < (cast_val(Uint, precision) + 1)) {
debugAssert(carry < 10);
*str++ = cast_val(Char, carry) + '0';
digit_count++;
} else {
// Even after printing (precision + 1) digits, the remaining fraction is
// not zero. So we mark that condition by setting digit_count to a value
// one greater than that.
digit_count = cast_val(Uint, precision) + 2;
}
} while (non_zero_digits_left && (digit_count <= (cast_val(Uint, precision) + 1)));
str_frc_end = str - 1;
unused_variable(str_frc_end); // Prevent "set but unused" warnings
// TODO(naman): Add support for printing floats in exponential notation (A.BCe+D)
// This may be useful: https://gist.github.com/forksnd/c9a91957e9e5c933635148d3b74f7d65
// Now, do rounding based on round-half-to-even (Banker's rounding)
// https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
// The basic algorithm is this, given precision after decimal point is p,
// 1. Let rounding_digit be the digit at position (p+1)
// 2. Let tail be all digits that come after rounding_digit
// 3. If rounding_digit > 5, round up
// 4. If rounding_digit < 5, round down (do nothing)
// 5. If rounding_digit == 5,
// A. If tail has any non-zero digits: round up
// B. If tail is all zeroes,
// a) If digit at position p is odd, round up
// b) If digit at position p is even, do nothing
// The edge cases to keep in mind are:
// 1. precision == 0, meaning there is no fractional part
// 2. Round-up cascade reaching into integral part from fractional part
// 3. For negative numbers, rounding is done to magnitude and sign is added later
Bool round_up;
if (digit_count <= cast_val(Uint, precision)) { // We needed more fractional digits
// We didn't collect (precision + 1) digits, so non_zero_digits_left must have
// become false, meaning that there is a bunch of zeroes that should trail our
// number.
if (cast_val(Uint, precision) > digit_count) {
Uint zero_count = cast_val(Uint, precision) - digit_count;
// Trailing zeroes
memset(str, '0', zero_count);
str += zero_count;
str_frc_end = str - 1;
}
// Since the fraction ended prematurely, there is no rounding needed either.
round_up = false;
} else { // We got all the fractional digits we needed
const Char rounding_digit = str_frc_begin[precision];
// Bool tail_is_nonzero = ((digit_count > (cast_val(Uint, precision) + 1)) ||
// non_zero_digits_left);
Bool tail_is_nonzero;
if (digit_count > (cast_val(Uint, precision) + 1)) {
// We captured more than precision+1 digits => there is a non-zero tail
tail_is_nonzero = true;
} else if (digit_count < (cast_val(Uint, precision) + 1)) {
// We didn't even capture the rounding digit => fraction ended early => no tail
tail_is_nonzero = false;
} else {
// exactly precision+1 digits were captured — use the post-multiply flag
tail_is_nonzero = non_zero_digits_left;
}
if (rounding_digit > '5') {
round_up = true;
} else if (rounding_digit < '5') {
round_up = false;
} else { // rounding_digit == 5
if (tail_is_nonzero) {
round_up = true;
} else {
Char prev;
if (precision == 0) { // No fraction digits, rounding baed on integer
// We have still printed one fraction digit in the (precision + 1)
// slot, which is why the (rounding_digit == 5) condition is hit.
prev = *str_int_end;
} else { // Fraction digit present, rounding based on them
prev = str_frc_begin[precision - 1];
}
Bool prev_is_odd = (prev - '0') & 1;
round_up = prev_is_odd;
}
}
if (round_up) {
Sint k = precision - 1; // -1 if precision == 0
while (k >= 0) { // Only run if there are fractional digits to print
if (str_frc_begin[k] < '9') { // Can be rounded up in place
str_frc_begin[k]++; // Round up
break; // and we are done
} else { // str_frc_begin[k] == '9'
debugAssert(str_frc_begin[k] == '9');
str_frc_begin[k] = '0'; // 9+1 = 10, so 0 here and 1 carries over
k--; // We need to apply the carry to the digit on the left
}
}
if (k < 0) { // Either no fractional digits, or 1 carried from fractional rounding
Char *int_digit = str_int_end;
while (int_digit >= str_int_begin) {
if (*int_digit < '9') {
(*int_digit)++;
break;
} else {
*int_digit = '0';
int_digit--;
}
}
if (int_digit < str_int_begin) { // One more carry
debugAssert(int_digit == (str_int_begin - 1));
rounding_up_added_a_new_integer_digit_on_the_left = true;
*int_digit = '1';
str_int_begin--;
}
}
}
str = str_frc_begin + precision;
}
}
*str++ = '\0';
}
len = cast_val(Uptr, str - num_str_ptr);
if (rounding_up_added_a_new_integer_digit_on_the_left) {
str = num_str_ptr;
} else {
str = num_str_ptr + 1;
}
precision = 0;
} break;
case '%': {
str = num_str_ptr;
str[0] = '%';
len = 1;
precision = 0;
} break;
case '\0': {
// If the format string ends prematurely, we print
// whatever half-formed conversion specification came through;
// to do that, we decrement the pointer since we need to make
// sure that we don't end up copying the terminating null byte,
// since that will be put in the final output at the end. We
// also want to decrement since after the next fmt++ (at the
// end of the function), we want fmt to point to the null byte
// so that the priting will properly end.
fmt--;
goto nlib_print_unimplemented_feature;
} break;
default: { // unknown, just copy conversion specification
nlib_print_unimplemented_feature:
str = num_str_ptr;
while (format_pointer <= fmt) {
str[0] = format_pointer[0];
format_pointer++;
str++;
}
len = cast_val(Size, str - num_str_ptr);
str = num_str_ptr;
precision = 0;
} break;
}
Size head_size = head_index;
head_index = 0;
Size tail_size = tail_index;
tail_index = 0;
// get minimum_width=leading/trailing space, precision=leading zeros
if (cast_val(Size, precision) < len) {
precision = cast_val(Sint, len);
}
Sint zeros_head_tail = (precision +
cast_val(Sint, head_size) +
cast_val(Sint, tail_size));
if (minimum_width < zeros_head_tail) {
minimum_width = zeros_head_tail;
}
minimum_width -= zeros_head_tail;
precision -= cast_val(Sint, len);
// handle right justify and leading zeros
if ((flags & Print_Flags_LEFT_JUSTIFIED) == 0) {
if (flags & Print_Flags_LEADING_ZERO) { // then everything is in precision
precision = (minimum_width > precision) ? minimum_width : precision;
minimum_width = 0;
}
}
// copy leading spaces
if ((minimum_width + precision) > 0) {
// copy leading spaces (or when doing %8.4d stuff)
if ((flags & Print_Flags_LEFT_JUSTIFIED) == 0) {
for (Sint i = 0; i < minimum_width; i++) {
PRINT_ADD_CHAR(' ');
}
}
for (Size i = 0; i < head_size; i++) { // copy the head
PRINT_ADD_CHAR(head_str[i]);
}
head_index += head_size;
for (Sint i = 0; i < precision; i++) { // copy leading zeros
PRINT_ADD_CHAR('0');
}
}
if (head_index < head_size) { // copy the head
Size repeat = head_size - head_index;
for (Size i = 0; i < repeat; i++) {
PRINT_ADD_CHAR(head_str[i]);
}
head_index += repeat;
}
for (Size i = 0; i < len; i++) { // copy the string
PRINT_ADD_CHAR(str[i]);
}
str += len;
for (Size i = 0; i < tail_size; i++) { // copy the tail
PRINT_ADD_CHAR(tail_str[i]);
}
// handle the left justify
if (flags & Print_Flags_LEFT_JUSTIFIED) {
for (Sint i = 0; i < minimum_width; i++) {
PRINT_ADD_CHAR(' ');
}
}
fmt++;
} else if (fmt[0] =='\0') {
break;
}
}
if (buffer_index) {
callback(userdata, buffer, buffer_index, buffer_size);
}
#undef PRINT_ADD_CHAR
return total_size;
}
/**************************************************************************************
* Memory
*/
header_function
void* memoryAlignUpPtrTo (void *p, Size align)
{
    // Round `p` up to the next multiple of `align` (must be a power of two).
    // A pointer that is already aligned is returned unchanged.
    Size mask = align - 1LLU;
    Uptr bumped = cast_bit(Uptr, p) + mask;
    Uptr aligned = bumped & cast_val(Uptr, ~mask);
    return cast_bit(void *, aligned);
}
header_function
Size memoryAlignUpSizeTo (Size s, Size align)
{
    // Round `s` up to the next multiple of `align` (must be a power of two).
    Size mask = align - 1LLU;
    return (s + mask) & ~mask;
}
header_function
void* memoryAlignDownPtrTo (void *p, Size align)
{
    // Round `p` down to the nearest multiple of `align` (a power of two).
    // Aligning (p - (align-1)) upwards lands on the greatest multiple of
    // `align` that is <= p; an already-aligned pointer maps to itself.
    Size mask = align - 1LLU;
    Byte *bytes = cast_val(Byte *, p);
    return cast_val(Byte *, memoryAlignUpPtrTo(bytes - mask, align));
}
header_function
Size memoryAlignDownSizeTo (Size s, Size align)
{
    // Round `s` down to the nearest multiple of `align` (a power of two).
    //
    // The previous implementation computed memoryAlignUpSizeTo(s - k, align),
    // which is ((s - k) + k) & ~k == s & ~k, but only reads correctly because
    // unsigned wraparound cancels out when s < k. Masking the low bits
    // directly is the conventional form, is exactly equivalent for every
    // input (including s == 0), and does not depend on wraparound.
    Size k = align - 1LLU;
    Size result = s & ~k;
    return result;
}
// Type-generic alignment helpers: memoryAlignUpTo/memoryAlignDownTo accept
// either a void* or a Size and dispatch to the matching implementation above.
#if defined(ENV_LANG_C) && ENV_LANG_C >= 2011
// C11 and later: compile-time dispatch via _Generic
# define memoryAlignUpTo(x, a) _Generic((x), void*: memoryAlignUpPtrTo, Size:memoryAlignUpSizeTo)(x, a)
# define memoryAlignDownTo(x, a) _Generic((x), void*: memoryAlignDownPtrTo, Size:memoryAlignDownSizeTo)(x, a)
#elif defined(ENV_LANG_CXX)
// C++: ordinary function overloading achieves the same dispatch
header_function void* memoryAlignUpTo (void *p, Size align) { return memoryAlignUpPtrTo(p, align); }
header_function Size memoryAlignUpTo (Size s, Size align) { return memoryAlignUpSizeTo(s, align); }
header_function void* memoryAlignDownTo (void *p, Size align) { return memoryAlignDownPtrTo(p, align); }
header_function Size memoryAlignDownTo (Size s, Size align) { return memoryAlignDownSizeTo(s, align); }
#endif
// Convenience forms aligning to the strictest scalar alignment.
// NOTE(review): `# define` (with a space) is legal but inconsistent with the
// flush `#define` directly below — presumably left over from an indented
// preprocessor block; confirm before normalizing.
# define memoryAlignUp(x) (memoryAlignUpTo(x, alignof(max_align_t)))
# define memoryAlignDown(x) (memoryAlignDownTo(x, alignof(max_align_t)))
// sizeof rounded up so back-to-back allocations of x stay max-aligned
#define aligned_sizeof(x) memoryAlignUp(sizeof(x))
typedef struct Memory_Allocator_Interface Memory_Allocator_Interface;
// Signatures of the two allocator callbacks; used both to declare the
// function pointers below and to define conforming callback functions.
#define MEMORY_ALLOCATOR_REQUEST_FUNC(name) void* name (void *userdata, Size amount)
#define MEMORY_ALLOCATOR_RESCIND_FUNC(name) void name (void *userdata, void *ptr)
// A minimal allocator "vtable": opaque context plus allocate/free callbacks.
struct Memory_Allocator_Interface {
    void *userdata;                            // passed back to both callbacks
    MEMORY_ALLOCATOR_REQUEST_FUNC((*request)); // allocate `amount` bytes
    MEMORY_ALLOCATOR_RESCIND_FUNC((*rescind)); // release `ptr`
#if defined(ENV_SANITIZER_ADDRESS)
    Bool skip_address_sanitizer; // see memIDisableASAN for why this exists
#endif
};
header_function
Memory_Allocator_Interface memICreate (void *userdata, MEMORY_ALLOCATOR_REQUEST_FUNC((*request)), MEMORY_ALLOCATOR_RESCIND_FUNC((*rescind)))
{
    // Bundle an allocator's callbacks and its opaque context into a single
    // interface value, with any remaining fields zero-initialized.
    Memory_Allocator_Interface iface;
    memzero(&iface);
    iface.userdata = userdata;
    iface.request = request;
    iface.rescind = rescind;
    return iface;
}
// NOTE(naman): Since memIRescind poisons the memory in order to prevent use after free, we might get an
// "use after poisoning" error if that memory was to be reallocated. This might happen if the miface is being
// used to wrap a libc/system allocator. Thus, in such case, this function should be called to ensure that
// we don't end up with strange crashes upon memory allocation.
header_function
void memIDisableASAN (Memory_Allocator_Interface miface)
{
    // Intended to mark the interface so memIRequest/memIRescind skip their
    // manual ASAN poisoning (see the NOTE above about wrapped system
    // allocators).
    //
    // FIXME(review): `miface` is received BY VALUE, so the store below
    // mutates a local copy that is discarded on return -- the caller's
    // interface keeps skip_address_sanitizer == false and this function is
    // currently a no-op. To take effect it would need to accept
    // Memory_Allocator_Interface* (or return the modified copy for the
    // caller to assign back); either change alters the public signature, so
    // it is flagged here instead of silently applied.
    unused_variable(miface);
#if defined(ENV_SANITIZER_ADDRESS)
    miface.skip_address_sanitizer = true;
#endif
}
header_function
void* memIRequest (Memory_Allocator_Interface miface, Size size)
{
    // Allocate `size` bytes through the interface.
    //
    // Under AddressSanitizer a header of alignof(max_align_t) bytes is
    // prepended to every allocation: the requested size is stored there so
    // memIRescind can later re-poison the full region, and the header itself
    // is poisoned so stray writes just before the returned pointer trip ASAN.
#if defined(ENV_SANITIZER_ADDRESS)
    Size real_size = size + alignof(max_align_t);
    Byte *orig_ptr = cast_val(Byte*, miface.request(miface.userdata, real_size));
    // (real_size - size) == alignof(max_align_t), so `ptr` remains maximally
    // aligned as long as the underlying allocator returns aligned memory.
    Byte *ptr = orig_ptr + (real_size - size);
    if (miface.skip_address_sanitizer == false) {
        // Make the whole region writable before storing the size header
        ASAN_UNPOISON_MEMORY_REGION(orig_ptr, real_size);
    }
    Size *s = cast_val(Size*, cast_val(void*, orig_ptr));
    static_assert(sizeof(*s) <= alignof(max_align_t), "Header is too small to store size");
    *s = size;
    if (miface.skip_address_sanitizer == false) {
        // Poison only the header; the payload stays accessible to the caller
        ASAN_POISON_MEMORY_REGION(orig_ptr, alignof(max_align_t));
    }
    return ptr;
#else
    return miface.request(miface.userdata, size);
#endif
}
header_function
void memIRescind (Memory_Allocator_Interface miface, void *ptr)
{
    // Release memory previously obtained from memIRequest.
    //
    // Under AddressSanitizer, `ptr` points just past the poisoned size
    // header, so: step back to the real allocation start, read the stored
    // size, poison the entire region (to catch use-after-free), and hand the
    // original pointer back to the underlying allocator.
#if defined(ENV_SANITIZER_ADDRESS)
    Byte *p = cast_val(Byte*, ptr);
    void *orig_ptr = p - alignof(max_align_t);
    if (miface.skip_address_sanitizer == false) {
        // The header must be readable again to fetch the stored size
        ASAN_UNPOISON_MEMORY_REGION(orig_ptr, alignof(max_align_t));
    }
    Size *s = cast_val(Size*, orig_ptr);
    static_assert(sizeof(*s) <= alignof(max_align_t), "Header is too small to fetch size");
    Size size = *s;
    Size real_size = size + alignof(max_align_t);
    if (miface.skip_address_sanitizer == false) {
        // Poison header + payload; see the NOTE above memIDisableASAN for
        // why this must be skippable when wrapping a recycling allocator.
        ASAN_POISON_MEMORY_REGION(orig_ptr, real_size);
    }
    miface.rescind(miface.userdata, orig_ptr);
#else
    miface.rescind(miface.userdata, ptr);
#endif
}
/*****************************************************************************************
* Memory Push Allocator
*/
// One block in the push allocator's singly-linked chain. The payload lives in
// the flexible array member `data`, directly after this header.
typedef struct Memory_Push_Block_ {
    struct Memory_Push_Block_ *next; // next block (possibly a recycled one)
    Size cap;                        // payload capacity of `data`, in bytes
    Size len;                        // bytes already allotted from `data`
    Byte _pad[8];                    // pads the header so `data` is max-aligned
    Byte data[];                     // payload (flexible array member)
} Memory_Push_Block_;
static_assert(offsetof(Memory_Push_Block_, data) % alignof(max_align_t) == 0, "The data member has to be properly aligned");
// A push ("arena"/"bump") allocator: allocations are carved sequentially out
// of blocks obtained from `parent_miface` and reclaimed only in bulk
// (memPushRemitAll / memPushDelete).
typedef struct Memory_Push {
    Memory_Allocator_Interface parent_miface; // source of the backing blocks
    Memory_Allocator_Interface miface;        // this arena exposed as a generic interface
    Memory_Push_Block_ *first, *last;         // block chain; `last` is the bump cursor
    Size size;                                // default capacity for newly acquired blocks
} Memory_Push;
header_function void* memPushAllot (Memory_Push *push, Size size);
header_function
MEMORY_ALLOCATOR_REQUEST_FUNC(memPush_IfaceAllocate)
{
    // Interface shim: forward an allocation request to the push allocator.
    Memory_Push *push = cast_val(Memory_Push*, userdata);
    return memPushAllot(push, amount);
}
header_function
MEMORY_ALLOCATOR_RESCIND_FUNC(memPush_IfaceDeallocate)
{
    // A push allocator never rescinds individual allocations (memory comes
    // back wholesale via memPushRemitAll/memPushDelete), so this is a no-op.
    Memory_Push *push = cast_val(Memory_Push*, userdata);
    unused_variable(push);
    unused_variable(ptr);
}
header_function
Memory_Allocator_Interface memPushGetInterface (Memory_Push *mp)
{
    // Expose the push allocator through the generic allocator interface.
    Memory_Allocator_Interface iface = mp->miface;
    return iface;
}
header_function
Size memPush_PaddingForAlignment (void *ptr, Size offset, Size alignment)
{
    // Number of filler bytes needed after (ptr + offset) so the next
    // allocation starts on an `alignment` boundary; zero when the position
    // is already aligned.
    Uptr location = cast_bit(Uptr, ptr) + offset;
    Uptr misalign = location % alignment;
    if (misalign == 0) {
        return 0;
    }
    return alignment - misalign;
}
header_function
void* memPush_AcquireMemory (Memory_Allocator_Interface miface, Size size)
{
    // Obtain one fresh block with room for `size` payload bytes: zero the
    // header, record the capacity, and (under ASAN) poison the payload until
    // it is actually handed out by the allocator.
    Size total = sizeof(Memory_Push_Block_) + size;
    Memory_Push_Block_ *block = cast_val(Memory_Push_Block_ *, memIRequest(miface, total));
    memzero(block);
    block->len = 0;
    block->cap = size;
#if defined(ENV_SANITIZER_ADDRESS)
    ASAN_POISON_MEMORY_REGION(block->data, block->cap);
#endif
    return block;
}
header_function
void memPushCreate (Memory_Push *mp, Memory_Allocator_Interface miface, Size size)
{
    // Initialize a push (arena) allocator that carves allocations out of
    // `size`-byte blocks obtained from `miface`.
    memzero(mp);
    mp->parent_miface = miface;
    mp->miface = memICreate(mp, memPush_IfaceAllocate, memPush_IfaceDeallocate);
    mp->size = size;
    // memPush_AcquireMemory already zeroes the block header and sets `cap`,
    // so the duplicate memzero(mp->first) and cap reassignment that used to
    // follow were redundant and have been dropped.
    mp->first = cast_val(Memory_Push_Block_ *, memPush_AcquireMemory(miface, size));
    mp->last = mp->first;
}
header_function
void memPushDelete (Memory_Push *push)
{
    // Return every block to the parent allocator, then wipe the struct so a
    // stale Memory_Push can't be used by accident.
    Memory_Push_Block_ *block = push->first;
    while (block != nullptr) {
        Memory_Push_Block_ *next = block->next;
#if defined(ENV_SANITIZER_ADDRESS)
        // Unpoison before handing back so memIRescind may touch the region
        ASAN_UNPOISON_MEMORY_REGION(block->data, block->cap);
#endif
        memIRescind(push->parent_miface, block);
        block = next;
    }
    memzero(push);
}
// Bump-allocate `size` bytes aligned to `alignment` from the current block,
// walking (and growing) the block chain when the current block cannot fit it.
header_function
void* memPushAllotA (Memory_Push *push, Size size, Size alignment)
{
    Size pad = memPush_PaddingForAlignment(push->last->data, push->last->len, alignment);
    // BUGFIX: keep advancing until a block actually fits. The original moved to
    // `last->next` unconditionally whenever it existed (blocks are retained by
    // memPushRemitAll), which could overflow a recycled block smaller than the
    // current request.
    while ((push->last->len + pad + size) > push->last->cap) {
        if (push->last->next == nullptr) {
            // Oversize requests get their own block; `+ alignment` guarantees the
            // worst-case padding still fits. memPush_AcquireMemory sets `cap`.
            Size s = maximum(push->size, size) + alignment;
            push->last->next = cast_val(Memory_Push_Block_ *, memPush_AcquireMemory(push->parent_miface, s));
        }
        push->last = push->last->next;
        pad = memPush_PaddingForAlignment(push->last->data, push->last->len, alignment);
    }
    push->last->len += pad;
    void *m = push->last->data + push->last->len;
    push->last->len += size;
#if defined(ENV_SANITIZER_ADDRESS)
    // Hand the region to the caller: unpoison exactly the returned bytes.
    ASAN_UNPOISON_MEMORY_REGION(m, size);
#endif
    return m;
}
// Default allocation entry point: maximally aligned.
header_function
void* memPushAllot (Memory_Push *push, Size size)
{
    void *mem = memPushAllotA(push, size, alignof(max_align_t));
    return mem;
}
// Forget every allocation, but keep all acquired blocks for reuse.
header_function
void memPushRemitAll (Memory_Push *push)
{
    Memory_Push_Block_ *mpb = push->first;
    while (mpb != nullptr) {
        mpb->len = 0;
#if defined(ENV_SANITIZER_ADDRESS)
        // Re-poison the whole payload until it is allocated again.
        ASAN_POISON_MEMORY_REGION(mpb->data, mpb->cap);
#endif
        mpb = mpb->next;
    }
    push->last = push->first;
}
/************************************************************************************************
* Memory Slab Allocator
*/
// Fixed-size-element pool ("slab"): a contiguous buffer of elem_count slots,
// with free-slot bookkeeping delegated to a Free_Grid.
typedef struct Memory_Slab {
    Free_Grid *fg; // Free-slot tracker (allocated from miface in slabCreate)
    Memory_Allocator_Interface miface; // Upstream allocator for fg and buffer
    void *buffer; // elem_count * elem_size bytes of slot storage
    Size memory_size; // Total byte size of buffer
    Size elem_size; // Stride of one slot
    Size elem_count; // Number of slots
    Size elem_allocated; // Live-slot count (used by the debug asserts)
} Memory_Slab;
// Set up a pool of `elem_count` fixed-size slots backed by `miface`.
header_function
void slabCreate (Memory_Slab *s, Memory_Allocator_Interface miface, Uint32 elem_count, Size elem_size)
{
    memzero(s);
    s->miface = miface;
    s->elem_size = elem_size;
    s->elem_count = elem_count;
    s->memory_size = elem_size * elem_count;
    // Free-slot bookkeeping lives in a separately allocated Free_Grid
    s->fg = cast_val(Free_Grid*, memIRequest(miface, sizeof(*s->fg)));
    fgCreate(s->fg, elem_count);
    s->buffer = memIRequest(miface, s->memory_size);
#if defined(ENV_SANITIZER_ADDRESS)
    // All slots stay poisoned until requested
    ASAN_POISON_MEMORY_REGION(s->buffer, s->memory_size);
#endif
}
// Tear down the slab: return the buffer, the free-grid internals, and the
// free-grid struct itself to the upstream allocator.
header_function
void slabDelete (Memory_Slab *s)
{
#if defined(ENV_SANITIZER_ADDRESS)
    ASAN_UNPOISON_MEMORY_REGION(s->buffer, s->memory_size);
#endif
    memIRescind(s->miface, s->buffer);
    fgDelete(s->fg);
    // BUGFIX: the Free_Grid struct was obtained via memIRequest in slabCreate;
    // rescind it as well so the slab does not leak it. (Assumes fgDelete frees
    // only the grid's internals, not the externally supplied struct.)
    memIRescind(s->miface, s->fg);
}
// Slot index = byte distance from the buffer start divided by the stride.
header_function
Size slabGetIndexOfPtr (Memory_Slab *s, void *m)
{
    Uptr distance = cast_bit(Uptr, m) - cast_bit(Uptr, s->buffer);
    return distance / s->elem_size;
}
// Inverse of slabGetIndexOfPtr: address of the slot at `index`.
header_function
void* slabGetPtrOfIndex (Memory_Slab *s, Size index)
{
    Byte *base = cast_val(Byte *, s->buffer);
    return base + (index * s->elem_size);
}
// Allocate one slot from the pool. Asserts (debug builds) that a slot is free.
header_function
void* slabRequest (Memory_Slab *s)
{
    debugAssert(s->elem_allocated < s->elem_count);
    Uint32 index;
    // BUGFIX: fgAllocate has a required side effect (it claims the slot and
    // writes `index`), so it must not live inside debugAssert, which may
    // compile to nothing in release builds.
    Bool allocated = fgAllocate(s->fg, &index);
    debugAssert(allocated);
    unused_variable(allocated);
    void *result = slabGetPtrOfIndex(s, index);
#if defined(ENV_SANITIZER_ADDRESS)
    ASAN_UNPOISON_MEMORY_REGION(result, s->elem_size);
#endif
    s->elem_allocated++;
    return result;
}
// Return a slot to the pool; `m` must be a pointer previously obtained from
// slabRequest on the same slab.
header_function
void slabRescind (Memory_Slab *s, void *m)
{
    debugAssert(s->elem_allocated);
#if defined(ENV_SANITIZER_ADDRESS)
    ASAN_POISON_MEMORY_REGION(m, s->elem_size);
#endif
    Uint32 index = cast_val(Uint32, slabGetIndexOfPtr(s, m));
    fgFree(s->fg, index);
    s->elem_allocated--;
}
/************************************************************************************************
* TLSF Allocator
* --------------
*
* The following is a sample table with buffer_size = GiB(4), min_alloc = MiB(1) and increment_step = 8
* Each row is primary level, where the buffer is divided into units of size being consecutive powers of two
 * (geometric progression). Each column then breaks each unit into a linear sub-unit of size being a constant
* increment (arithmetic progression).
*
* ________________________________________________________________________________________________________________________
* | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
* |----|-------|---------------|--------------|--------------|--------------|--------------|--------------|--------------|
* | 31 | 2G | 2G+256M | 2G+512M | 2G+768M | 3G | 3G+256M | 3G+512M | 3G+768M |
* | 30 | 1G | 1G+128M | 1G+256M | 1G+384M | 1G+512M | 1G+640M | 1G+768M | 1G+896M |
* | 29 | 512M | 576M | 640M | 704M | 768M | 832M | 896M | 960M |
* | 28 | 256M | 288M | 320M | 352M | 384M | 416M | 448M | 480M |
* | 27 | 128M | 144M | 160M | 176M | 192M | 208M | 224M | 240M |
* | 26 | 64M | 72M | 80M | 88M | 96M | 104M | 112M | 120M |
* | 25 | 32M | 36M | 40M | 44M | 48M | 52M | 56M | 60M |
* | 24 | 16M | 18M | 20M | 22M | 24M | 26M | 28M | 30M |
* | 23 | 8M | 9M | 10M | 11M | 12M | 13M | 14M | 15M |
* | 22 | 4M | 4M+512K | 5M | 5M+512K | 6M | 6M+512K | 7M | 7M+512K |
* | 21 | 2M | 2M+256K | 2M+512K | 2M+768K | 3M | 3M+256K | 3M+512K | 3M+768K |
* | 20 | 1M | 1M+128K | 1M+256K | 1M+384K | 1M+512K | 1M+640K | 1M+768K | 1M+896K |
* |------------|---------------|--------------|--------------|--------------|--------------|--------------|--------------|
*
* Reference:
* [1] : http://www.gii.upv.es/tlsf/files/papers/tlsf_desc.pdf
* [2] : http://www.gii.upv.es/tlsf/files/papers/ecrts04_tlsf.pdf
* [3] : http://www.gii.upv.es/tlsf/files/papers/jrts2008.pdf
*/
// TODO(naman): Right now, this is 64-bytes large. Once I am sure that the allocator is bug-free (i.e., once it has been
// tested enough), rename `next_slot_available_for_dispensing` and `next_free` to `next` and `prev_free` to `prev`, since a slot can either
// be empty or house a free node but not both. Also, store the `allocated` bool in the top bit of `size`. That will drop the
// size of the struct to 48-bytes, reducing the metadata size by close to 25%.
// Metadata for one contiguous block of the managed address range. A node can be
// in one of three states: dispensable (empty slot), free block, or allocated block.
typedef struct TLSF_Node {
    Uint64 offset; // Byte offset of the block inside the managed range
    Uint64 size; // Block size in bytes
    // Contains the pointer to the next slot from which a node can be dispensed in case of out-of-band allocation
    struct TLSF_Node *next_slot_available_for_dispensing;
    // For free blocks. The next member of the node contains the index of the next node in chain corresponding to another
    // free block in the same size class. The index of chain's head is stored in the free_table.
    struct TLSF_Node *next_free, *prev_free;
    // For allocated and free blocks. The following members contain the index of the nodes associated with the blocks right
    // after and before them in the address space.
    struct TLSF_Node *next_mem, *prev_mem;
    Bool allocated; // false = free (make sure it's not left true for empty slots)
    Byte _pad[7]; // Explicit padding: keeps sizeof(TLSF_Node) a multiple of max alignment
} TLSF_Node;
static_assert(sizeof(TLSF_Node) % alignof(max_align_t) == 0, \
              "TLSF Nodes need to be of the same size as alignment, " \
              "so that they can be packed in the node_dispenser, etc.");
// Number of columns (linear subdivisions) per row of the free table
#define TLSF_INCREMENT_COUNT_PER_ROW 16
// Two-Level Segregated Fit allocator state; see the table and papers above.
typedef struct TLSF {
    Memory_Allocator_Interface miface; // Allocator this struct (and its tail data) came from
    Uint64 buffer_size; // Total managed size (power of two)
    Uint64 min_alloc; // Smallest allocation granularity (power of two)
    Uint32 rows, columns; // rows = log2(buffer_size/min_alloc); columns = TLSF_INCREMENT_COUNT_PER_ROW
    // One bit for each row, to show if any bit in the corresponding row_bitmap is set.
    // Bitmap's LSB->MSB is bottom->top row
    // NOTE(review): "bitap" looks like a typo for "bitmap"; kept since renaming
    // a struct member would be an interface change.
    Uint64 column_bitap;
    // One bitmap per row, with each bit standing for one cell.
    // Index 0->N maps to bottom->top row, bitmap's LSB->MSB is left->right cell
    Uint64 row_bitmaps[64];
    TLSF_Node* free_table[64][TLSF_INCREMENT_COUNT_PER_ROW]; // Free-list heads per size class
    union { // Both point to TLSF->data
        Byte *memory; // In-place memory allocator
        TLSF_Node *node_dispenser; // Out-of-band 64-bit address space allocator
    };
    TLSF_Node *first_slot_available_for_dispensing; // If this is NULL, we are dealing with in-place memory allocator
    Byte data[]; // Node array (out-of-band) or the managed buffer itself (in-band)
} TLSF;
// Head pointer of the free list for size class (row r, column c)
#define TLSF_HEAD_OF_FREE_LIST_(t, r, c) ((t)->free_table[(r)][(c)])
// Derivation of the size -> (row, column) mapping; both macros assume
// (t)->columns is a power of two.
// sl = log2(size)
// ml = log2(min_alloc)
// il = log2(increment)
//
// r = sl - ml
// c = (size - 2^sl) / ((2*2^sl - 2^sl)/2^il)
//   = (size - 2^sl) / (2^sl*(2 - 1) / 2^il)
//   = (size - 2^sl) / (2^sl / 2^il)
//   = (size - 2^sl) / (2^(sl-il))
//   = (size/2^(sl-il)) - (2^sl-(2^(sl-il)))
//   = (size/2^(sl-il)) - 2^il
//   = (size >> (sl-il)) - 2^il
//   = (size >> (sl-il)) - (1 << il)
#define TLSF_ROW_(t, s) cast_val(Uint8, (bitLog2U64(s) - bitLog2U64((t)->min_alloc)))
#define TLSF_COL_(t, s) cast_val(Uint8, (((s) >> (bitLog2U64(s) - bitLog2U64(cast_val(Uint64, ((t)->columns))))) - \
                                         (1ULL << bitLog2U64(cast_val(Uint64, ((t)->columns))))))
// Non-null iff metadata is out-of-band and the node dispenser has slots left
#define TLSF_OOB_(t) ((t)->first_slot_available_for_dispensing)
// NOTE(naman): All three parameters should be powers of two with value >= 2.
// buffer_size should be greater than min_alloc
#define tlsfCreate(_miface, _bufsz, _minal) tlsfCreateEx(_miface, _bufsz, _minal, true)
// Create a TLSF allocator managing `buffer_size` bytes of (real or abstract)
// address space. With out_of_band_metadata, nodes live in a dispenser array and
// the allocator only tracks a 64-bit range; otherwise nodes are embedded at the
// start of each block inside the buffer that follows this struct.
// All size parameters must be powers of two >= 2, buffer_size > min_alloc.
header_function
TLSF* tlsfCreateEx (Memory_Allocator_Interface miface,
                    Uint64 buffer_size, Uint64 min_alloc,
                    Bool out_of_band_metadata)
{
    Uint8 increment_steps = 16; // Must agree with TLSF_INCREMENT_COUNT_PER_ROW
    Uint8 buffer_size_bits = bitLog2U64(buffer_size);
    Uint8 min_alloc_bits = bitLog2U64(min_alloc);
    Uint8 row_count = cast_val(Uint8, buffer_size_bits - min_alloc_bits);
    Size maximum_allocation_count = buffer_size / min_alloc;
    Size total_size = memoryAlignUpTo(sizeof(TLSF), alignof(max_align_t));
    if (out_of_band_metadata) {
        // Worst case: every allocation is min_alloc sized, one node each
        Size list_size = maximum_allocation_count * sizeof(TLSF_Node);
        total_size += memoryAlignUpTo(list_size, alignof(max_align_t));
    } else {
        // The managed buffer itself trails the struct
        total_size += memoryAlignUpTo(buffer_size, alignof(max_align_t));
    }
    TLSF *tlsf = cast_val(TLSF *, memIRequest(miface, total_size));
    memzero(tlsf);
    tlsf->miface = miface;
    tlsf->buffer_size = buffer_size;
    tlsf->min_alloc = min_alloc;
    tlsf->rows = row_count;
    tlsf->columns = increment_steps;
    if (out_of_band_metadata) {
        pragma_clang("clang diagnostic push");
        pragma_clang("clang diagnostic ignored \"-Wcast-align\"");
        tlsf->node_dispenser = cast_ptr(TLSF_Node *, tlsf->data);
        pragma_clang("clang diagnostic pop");
        tlsf->first_slot_available_for_dispensing = &tlsf->node_dispenser[0];
        // Put all available nodes in a list so that we can find the next one in O(1) for real-time.
        // BUGFIX: the original loop ran to maximum_allocation_count-1, leaving
        // the LAST node unzeroed with a garbage dispenser link; now every node
        // is zeroed and the final node terminates the list (nullptr via memzero).
        for (Size i = 0; i < maximum_allocation_count; i++) {
            memzero(&tlsf->node_dispenser[i]);
            if ((i + 1) < maximum_allocation_count) {
                tlsf->node_dispenser[i].next_slot_available_for_dispensing = &tlsf->node_dispenser[i + 1];
            }
        }
    } else {
        tlsf->memory = tlsf->data;
        tlsf->first_slot_available_for_dispensing = nullptr;
        tlsf->min_alloc += sizeof(TLSF_Node); // min_alloc is only used for checking if enough memory is available
    }
    { // Adding the full buffer in free_table
        TLSF_Node node;
        memzero(&node);
        node.offset = 0;
        node.size = buffer_size - 1; // -1 keeps the size class inside the top row
        Uint8 row = TLSF_ROW_(tlsf, node.size);
        Uint8 col = TLSF_COL_(tlsf, node.size);
        TLSF_Node *slot_for_full_buffer = nullptr;
        if (out_of_band_metadata) {
            slot_for_full_buffer = tlsf->first_slot_available_for_dispensing;
            tlsf->first_slot_available_for_dispensing = slot_for_full_buffer->next_slot_available_for_dispensing;
        } else {
            pragma_clang("clang diagnostic push");
            pragma_clang("clang diagnostic ignored \"-Wcast-align\"");
            slot_for_full_buffer = cast_ptr(TLSF_Node *, tlsf->memory);
            pragma_clang("clang diagnostic pop");
        }
        *slot_for_full_buffer = node;
        TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col) = slot_for_full_buffer;
        tlsf->column_bitap |= (1ULL << row);
        tlsf->row_bitmaps[row] |= (1ULL << col);
    }
    return tlsf;
}
// Destroy the allocator. Everything (struct, metadata, in-band buffer) is one
// allocation, so a single rescind suffices.
header_function
void tlsfDelete (TLSF *tlsf)
{
    // Copy the interface out first: rescinding frees the struct that holds it.
    Memory_Allocator_Interface miface = tlsf->miface;
    memIRescind(miface, tlsf);
}
// Push a free `node` at the head of its size class's list and set the bitmap
// bits that record the class (and its row) as non-empty.
header_function
void tlsf_AddNodeToFreeList (TLSF *tlsf, TLSF_Node *node)
{
    Size row = TLSF_ROW_(tlsf, node->size); // fl in [1]
    Size col = TLSF_COL_(tlsf, node->size); // sl in [1]
    TLSF_Node *head = TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col);
    node->prev_free = nullptr;
    node->next_free = head;
    if (head != nullptr) {
        head->prev_free = node;
    }
    TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col) = node;
    tlsf->column_bitap |= (1ULL << row);
    tlsf->row_bitmaps[row] |= (1ULL << col);
}
// Unlink `node` from its size class's free list, clearing the bitmap bits when
// the class (and then the whole row) becomes empty. The bitmap update relies on
// the list head having been updated first, so statement order matters here.
header_function
void tlsf_RemoveNodeFromFreeList (TLSF *tlsf, TLSF_Node *node)
{
    Size row = TLSF_ROW_(tlsf, node->size); // fl in [1]
    Size col = TLSF_COL_(tlsf, node->size); // sl in [1]
    // Remove node from the free list
    if (node->next_free) node->next_free->prev_free = node->prev_free;
    if (node->prev_free) node->prev_free->next_free = node->next_free;
    // Remove node from table
    if (TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col) == node) {
        TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col) = node->next_free;
    }
    // Update bitmap
    if (TLSF_HEAD_OF_FREE_LIST_(tlsf, row, col) == nullptr) { // No free blocks left in that size range
        tlsf->row_bitmaps[row] &= ~(1ULL << col);
        if (tlsf->row_bitmaps[row] == 0) {
            tlsf->column_bitap &= ~(1ULL << row);
        }
    }
}
// Hand out a zeroed node slot: from the dispenser when metadata is out-of-band,
// otherwise embedded in-place at `offset` inside the managed buffer.
header_function
TLSF_Node* tlsf_AcquireNode (TLSF *tlsf, Size offset)
{
    // NOTE(review): with out-of-band metadata, an exhausted dispenser makes
    // TLSF_OOB_() false and this would fall through to the in-place branch,
    // stomping the node array — confirm the dispenser can never run dry here.
    TLSF_Node *node;
    if (TLSF_OOB_(tlsf)) {
        node = tlsf->first_slot_available_for_dispensing;
        tlsf->first_slot_available_for_dispensing = node->next_slot_available_for_dispensing;
    } else {
        pragma_clang("clang diagnostic push");
        pragma_clang("clang diagnostic ignored \"-Wcast-align\"");
        node = cast_ptr(TLSF_Node *, &tlsf->data[offset]);
        pragma_clang("clang diagnostic pop");
    }
    memzero(node);
    return node;
}
// Return a node slot. Out-of-band: push it back onto the dispenser list.
// In-band: nothing to do, the slot lives inside the managed buffer itself.
header_function
void tlsf_ReleaseNode (TLSF *tlsf, TLSF_Node *node)
{
    if (TLSF_OOB_(tlsf)) {
        node->next_slot_available_for_dispensing = tlsf->first_slot_available_for_dispensing;
        tlsf->first_slot_available_for_dispensing = node;
    }
}
// WARN(naman): When using with in-band memory, make sure to add the sizeof(TLSF_Node) to size_requested
// before calling this function.
// Allocate `size_requested` bytes (rounded up to min_alloc) and return its
// node, or nullptr when no suitable free block exists.
header_function
TLSF_Node* tlsfAllocate (TLSF *tlsf, Size size_requested)
{
    if (size_requested > tlsf->buffer_size) return nullptr;
    Size size = maximum(size_requested, tlsf->min_alloc);
    // The next slot for size (so that we do a loop-less good fit for real-time instead of a looped best fit)
    // sl = log2(size)
    // il = log2(increment)
    //
    // gap = size_of_each_column_in_that_row = ((2*2^sl - 2^sl)/2^il) = 2^(sl-il)
    // size_next = size + 2^(sl-il)
    // However, in case of size = 2^sl, we don't want to go to the next block. Thus, we should subtract 1 from size_next,
    // so that we end up back in the same table slot in that particular case.
    // => size_next = size + 2^(sl-il) - 1
    Size size_class = size + (1ULL << (bitLog2U64(size) - bitLog2U64(cast_val(Uint64, tlsf->columns)))) - 1;
    size_class = minimum(size_class, tlsf->buffer_size-1);
    Size row = TLSF_ROW_(tlsf, size_class); // fl in [1]
    Size col = TLSF_COL_(tlsf, size_class); // sl in [1]
    // Search this row at `col` or to its right; else fall to any higher row.
    Uint64 row_bitmap = tlsf->row_bitmaps[row] & (0xFFFFFFFFFFFFFFFFULL << col);
    Size row_available, col_available;
    if (row_bitmap) {
        col_available = bitLSBU64(row_bitmap);
        row_available = row;
    } else {
        Uint64 col_bitmap = tlsf->column_bitap & (0xFFFFFFFFFFFFFFFFULL << (row + 1));
        if (col_bitmap) {
            row_available = bitLSBU64(col_bitmap);
            col_available = bitLSBU64(tlsf->row_bitmaps[row_available]);
        } else {
            return nullptr; // Nothing large enough anywhere
        }
    }
    // Get the free node of the free list and mark it used
    TLSF_Node *free_node = TLSF_HEAD_OF_FREE_LIST_(tlsf, row_available, col_available); // This will definitely succeed, so free_node != nullptr
    // WARN(naman): Don't zero out free_node. Some of it's contents are used below and some are carried forward.
    tlsf_RemoveNodeFromFreeList(tlsf, free_node);
    TLSF_Node *split_node_slot = nullptr;
    if (free_node->size >= (size + tlsf->min_alloc)) { // If the acquired block is too big
        Size split_node_size = free_node->size - size;
        Size split_offset = free_node->offset + size;
        split_node_slot = tlsf_AcquireNode(tlsf, split_offset);
        split_node_slot->offset = split_offset;
        split_node_slot->size = split_node_size;
        split_node_slot->prev_mem = free_node;
        split_node_slot->next_mem = free_node->next_mem;
        // BUGFIX: the block after the split point must point back at the new
        // split node, not at the (now smaller) allocated node.
        if (split_node_slot->next_mem) split_node_slot->next_mem->prev_mem = split_node_slot;
        tlsf_AddNodeToFreeList(tlsf, split_node_slot);
    }
    { // Finalize the node for the allocated block
        // free_node->offset = offset; // Already set
        free_node->size = size;
        free_node->next_slot_available_for_dispensing = nullptr;
        free_node->next_free = nullptr;
        free_node->prev_free = nullptr;
        // BUGFIX: only rewrite next_mem when a split actually happened;
        // unconditionally assigning (a possibly-null split_node_slot) severed
        // the link to the block following this one.
        if (split_node_slot != nullptr) free_node->next_mem = split_node_slot;
        // free_node->prev_mem = prev_mem; // Already set
        free_node->allocated = true;
    }
    return free_node;
}
// If the block immediately before `node` in memory is free, coalesce the two
// into the left block and return it; otherwise return `node` unchanged.
// Handles both free nodes (during tlsfFree) and allocated ones (tlsfAdjustLeft),
// in which case the merged block inherits `node`'s allocated state.
header_function
TLSF_Node* tlsf_MergeLeft (TLSF *tlsf, TLSF_Node *node)
{
    TLSF_Node *left = node->prev_mem;
    if (left && !left->allocated) {
        Size node_size_old = node->size;
        Size left_size_old = left->size;
        Size left_size_new = node_size_old + left_size_old;
        tlsf_RemoveNodeFromFreeList(tlsf, left); // Must happen before left->size changes
        { // Remove node from the memory list
            left->next_mem = node->next_mem;
            if (left->next_mem) left->next_mem->prev_mem = left;
        }
        // Only touches the dispenser link, so node's fields remain readable below
        tlsf_ReleaseNode(tlsf, node);
        if (!node->allocated) { // If the node to be expanded is free (e.g., during deallocation)
            tlsf_RemoveNodeFromFreeList(tlsf, node);
            left->size = left_size_new; // Updated here, so that tlsf_AddNodeToFreeList below works
            tlsf_AddNodeToFreeList(tlsf, left);
        } else {
            left->allocated = true;
            node->allocated = false;
            left->size = left_size_new; // Still has to be updated if node->allocated == true
        }
        return left;
    } else {
        return node;
    }
}
// If the block immediately after `node` in memory is free, absorb it into
// `node` (which keeps its identity and offset) and return `node`.
// Works whether `node` is free (tlsfFree) or allocated (tlsfAdjustRight).
header_function
TLSF_Node* tlsf_MergeRight (TLSF *tlsf, TLSF_Node *node)
{
    TLSF_Node *right = node->next_mem;
    if (right && !right->allocated) {
        Size node_size_old = node->size;
        Size right_size_old = right->size;
        Size node_size_new = node_size_old + right_size_old;
        tlsf_RemoveNodeFromFreeList(tlsf, right); // Must happen before sizes change
        { // Remove right from the memory list
            node->next_mem = right->next_mem;
            if (node->next_mem) node->next_mem->prev_mem = node;
        }
        tlsf_ReleaseNode(tlsf, right);
        if (!node->allocated) { // If the node to be expanded is free (e.g., during deallocation)
            tlsf_RemoveNodeFromFreeList(tlsf, node);
            node->size = node_size_new; // Updated here, so that tlsf_AddNodeToFreeList below works
            tlsf_AddNodeToFreeList(tlsf, node);
        } else {
            node->size = node_size_new; // Still has to be updated if node->allocated == true
        }
    }
    return node;
}
// WARN(naman): When using with in-band memory, make sure to add the sizeof(TLSF_Node) to size_requested
// before calling this function.
// Resize an allocated block by moving its *start*: merge with a free left
// neighbour, then return any surplus at the (new) front as a free remnant.
// On success *node may point to a different node with a different offset; the
// block's data is NOT moved. Returns false (leaving any merge in place) when
// the block is not allocated or still too small after merging.
header_function
Bool tlsfAdjustLeft (TLSF *tlsf, TLSF_Node **node, Size new_size)
{
    if (!(*node)->allocated) return false;
    *node = tlsf_MergeLeft(tlsf, *node);
    new_size = maximum(new_size, tlsf->min_alloc);
    if ((*node)->size < new_size) return false;
    Size remnant_offset = (*node)->offset;
    Size remnant_size = (*node)->size - new_size;
    if (remnant_size == 0) return true;
    if (remnant_size < tlsf->min_alloc) return true; // Too small to track; caller keeps the surplus
    { // Add a remnant node
        TLSF_Node *remnant = tlsf_AcquireNode(tlsf, remnant_offset);
        // Size it properly
        remnant->offset = remnant_offset;
        remnant->size = remnant_size;
        (*node)->offset += remnant->size;
        (*node)->size = new_size;
        // Add to memory list
        remnant->next_mem = *node;
        remnant->prev_mem = (*node)->prev_mem;
        if ((*node)->prev_mem) (*node)->prev_mem->next_mem = remnant;
        (*node)->prev_mem = remnant;
        tlsf_AddNodeToFreeList(tlsf, remnant);
    }
    return true;
}
// WARN(naman): When using with in-band memory, make sure to add the sizeof(TLSF_Node) to size_requested
// before calling this function.
// Resize an allocated block in place at its *end*: merge with a free right
// neighbour, then return any surplus past new_size as a free remnant. The node
// pointer and offset never change, so the caller's data stays where it is.
// Returns false (leaving any merge in place) when the block is not allocated or
// still too small after merging.
header_function
Bool tlsfAdjustRight (TLSF *tlsf, TLSF_Node **node, Size new_size)
{
    if (!(*node)->allocated) return false;
    *node = tlsf_MergeRight(tlsf, *node);
    new_size = maximum(new_size, tlsf->min_alloc);
    if ((*node)->size < new_size) return false;
    Size remnant_offset = (*node)->offset + new_size;
    Size remnant_size = (*node)->size - new_size;
    if (remnant_size == 0) return true;
    if (remnant_size < tlsf->min_alloc) return true; // Too small to track; caller keeps the surplus
    { // Add a remnant node
        TLSF_Node *remnant = tlsf_AcquireNode(tlsf, remnant_offset);
        // Size it properly
        remnant->offset = remnant_offset;
        remnant->size = remnant_size;
        // node->offset doesn't change
        (*node)->size = new_size;
        // Add to memory list
        remnant->prev_mem = *node;
        remnant->next_mem = (*node)->next_mem;
        if ((*node)->next_mem) (*node)->next_mem->prev_mem = remnant;
        (*node)->next_mem = remnant;
        tlsf_AddNodeToFreeList(tlsf, remnant);
    }
    return true;
}
// Free an allocated block, coalescing with free neighbours on both sides.
// Returns false on a double free (block already free).
header_function
Bool tlsfFree (TLSF *tlsf, TLSF_Node *node)
{
    if (!node->allocated) return false;
    TLSF_Node *merged = tlsf_MergeLeft(tlsf, node);
    merged = tlsf_MergeRight(tlsf, merged);
    tlsf_AddNodeToFreeList(tlsf, merged);
    merged->allocated = false;
    return true;
}
// Size of an allocated block; free nodes report zero so callers cannot mistake
// stale sizes for live ones.
header_function
Size tlsfGetSize (TLSF *tlsf, TLSF_Node *node)
{
    unused_variable(tlsf);
    return node->allocated ? node->size : 0;
}
/************************************************************************************************
* TLSF-based General Purpose Memory Allocator
*/
// General-purpose allocator: an in-band TLSF arena carved out of a parent allocator.
typedef struct Memory {
    TLSF *tlsf; // In-band TLSF (metadata embedded in the managed buffer)
    Memory_Allocator_Interface parent_miface; // Allocator the arena came from
    Size max_mem; // Total managed size (power of two)
} Memory;
// NOTE(naman): With (512 MiB, 32 KiB, 16), it will need slightly more than 1 MiB of metadata.
// Create a general-purpose allocator managing `memory_size` bytes (which must
// be a power of two larger than the minimum granularity).
header_function
Memory memCreate (Memory_Allocator_Interface parent_miface, Size memory_size)
{
    // Minimum granularity must at least fit an in-band TLSF_Node, for >= 50% load.
    Size min_mem = bitNextPowerOf2(sizeof(TLSF_Node));
    debugAssert(memory_size > min_mem);
    debugAssert(bitIsPowerOf2(memory_size));
    debugAssert(bitIsPowerOf2(min_mem));
    Memory memory = {
        .tlsf = tlsfCreateEx(parent_miface, memory_size, min_mem, false),
        .parent_miface = parent_miface,
        .max_mem = memory_size,
    };
    return memory;
}
// Tears down the whole allocator; all outstanding allocations become invalid.
header_function
void memRemove (Memory *mt)
{
    tlsfDelete(mt->tlsf);
}
// Allocate `size` usable bytes. Returns nullptr when the arena is exhausted.
header_function
void* memAllocate (Memory *mt, Size size)
{
    // Reserve room for the in-band TLSF node that precedes the user memory.
    Size real_size = size + sizeof(TLSF_Node);
#if defined(ENV_SANITIZER_ADDRESS)
    // Plus poisoned guard zones on both sides of the user region.
    real_size += (2 * alignof(max_align_t));
#endif
    TLSF_Node *node = tlsfAllocate(mt->tlsf, real_size);
    // BUGFIX: tlsfAllocate returns nullptr on exhaustion; propagate it instead
    // of handing out a garbage pointer computed from nullptr + 1.
    if (node == nullptr) return nullptr;
    Byte *result = cast_ptr(Byte *, node + 1);
#if defined(ENV_SANITIZER_ADDRESS)
    Byte *san_header = result;
    ASAN_POISON_MEMORY_REGION(san_header, alignof(max_align_t));
    result += alignof(max_align_t);
    Byte *san_footer = result + size;
    ASAN_POISON_MEMORY_REGION(san_footer, alignof(max_align_t));
#endif
    return result;
}
// For Memory_Allocator_Interface
// Adapter for Memory_Allocator_Interface.
header_function
void* mem_Allocate (void *mt, Size size) {
    Memory *memory = cast_val(Memory*, mt);
    return memAllocate(memory, size);
}
// Return nullptr if extention failed, so that the application knows that more memory is not available
// Try to grow the allocation at `ptr` to `size` bytes in place (contents and
// pointer preserved). Returns the (unchanged) pointer on success, nullptr when
// no adjacent memory is available.
header_function
void* memExtend (Memory *mt, void *ptr, Size size)
{
#if defined(ENV_SANITIZER_ADDRESS)
    Byte *san_header_old = cast_ptr(Byte *, ptr) - alignof(max_align_t);
    ASAN_UNPOISON_MEMORY_REGION(san_header_old, alignof(max_align_t));
    TLSF_Node *node = cast_ptr(TLSF_Node *, san_header_old) - 1;
    // BUGFIX: tlsfGetSize takes the TLSF instance as its first argument; the
    // original one-argument call did not compile under ENV_SANITIZER_ADDRESS.
    Byte *san_footer_old = cast_ptr(Byte *, node) + tlsfGetSize(mt->tlsf, node) - alignof(max_align_t);
    ASAN_UNPOISON_MEMORY_REGION(san_footer_old, alignof(max_align_t));
#else
    // The in-band TLSF node sits immediately before the user pointer.
    TLSF_Node *node = cast_ptr(TLSF_Node *, ptr) - 1;
#endif
    Size real_size = size + sizeof(TLSF_Node);
#if defined(ENV_SANITIZER_ADDRESS)
    real_size += (2 * alignof(max_align_t));
#endif
    // BUGFIX: extend to the RIGHT, not the left. Growing leftwards moves the
    // block's start (and its in-band node) without moving the caller's data, so
    // the returned pointer would no longer refer to the original contents.
    // tlsfAdjustRight keeps the node (and hence `ptr`) in place.
    Bool adjusted = tlsfAdjustRight(mt->tlsf, &node, real_size);
    // NOTE(naman): If adjust succeeded, we need to poison the header and footer on newly allocated memory.
    // If it failed, we need to poison for old one, since we unpoisoned it above.
    Byte *result = cast_ptr(Byte *, node + 1);
#if defined(ENV_SANITIZER_ADDRESS)
    Byte *san_header = result;
    ASAN_POISON_MEMORY_REGION(san_header, alignof(max_align_t));
    result += alignof(max_align_t);
    Byte *san_footer = result + size;
    ASAN_POISON_MEMORY_REGION(san_footer, alignof(max_align_t));
#endif
    return adjusted ? result : nullptr;
}
// Release the allocation at `ptr` back to the arena.
header_function
void memFree (Memory *mt, void *ptr)
{
#if defined(ENV_SANITIZER_ADDRESS)
    // Unpoison the guard zones so the allocator may reuse/merge this block.
    Byte *san_header_old = cast_ptr(Byte *, ptr) - alignof(max_align_t);
    ASAN_UNPOISON_MEMORY_REGION(san_header_old, alignof(max_align_t));
    TLSF_Node *node = cast_ptr(TLSF_Node *, san_header_old) - 1;
    // BUGFIX: tlsfGetSize takes the TLSF instance as its first argument; the
    // original one-argument call did not compile under ENV_SANITIZER_ADDRESS.
    Byte *san_footer_old = cast_ptr(Byte *, node) + tlsfGetSize(mt->tlsf, node) - alignof(max_align_t);
    ASAN_UNPOISON_MEMORY_REGION(san_footer_old, alignof(max_align_t));
#else
    // The in-band TLSF node sits immediately before the user pointer.
    TLSF_Node *node = cast_ptr(TLSF_Node *, ptr) - 1;
#endif
    tlsfFree(mt->tlsf, node);
}
// For Memory_Allocator_Interface
header_function
void mem_Free (void *mt, void *ptr)
{
memFree(cast_val(Memory*, mt), ptr);
}
// Size of the allocation at `ptr` as tracked by the TLSF node before it.
header_function
Size memGetSize (Memory *mt, void *ptr)
{
#if defined(ENV_SANITIZER_ADDRESS)
    // The node sits just before the (poisoned) ASAN header guard; reading the
    // node itself is fine since only the guard bytes are poisoned.
    Byte *san_header_old = cast_ptr(Byte *, ptr) - alignof(max_align_t);
    TLSF_Node *node = cast_ptr(TLSF_Node *, san_header_old) - 1;
#else
    TLSF_Node *node = cast_ptr(TLSF_Node *, ptr) - 1;
#endif
    // NOTE(review): this is the block's internal size (node + ASAN guards
    // included), not the originally requested size — confirm callers expect that.
    return tlsfGetSize(mt->tlsf, node);
}
// Wrap this allocator in the generic allocator interface.
header_function
Memory_Allocator_Interface memGetInterface (Memory *mt)
{
    return memICreate(mt, mem_Allocate, mem_Free);
}
/************************************************************************************************
* Hashing
* *******
*/
// https://en.wikipedia.org/wiki/Pearson_hashing
// Pearson-style 8-bit hash: XOR each byte into the state, then run the state
// through a fixed bit permutation.
header_function
Uint8 hash8 (const void *data, Size len, Uint8 seed)
{
    const Byte *bytes = cast_val(const Byte*, data);
    Uint8 state = seed;
    for (Size idx = 0; idx < len; idx++) {
        const Uint8 mixed = cast_val(Uint8, state ^ bytes[idx]);
        // Permutation function generated with https://programming.sirrida.de/calcperm.php using input "4 0 3 7 5 2 1 6"
        state = cast_val(Uint8, (((mixed & 0x41) << 1) |
                                 ((mixed & 0x04) << 3) |
                                 ((mixed & 0x02) << 5) |
                                 ((mixed & 0x90) >> 4) |
                                 ((mixed & 0x28) >> 1)));
    }
    return state;
}
// Reference: Fast CRCs by Gam D. Nguyen (https://arxiv.org/pdf/1009.5949) [Fig. 8, Page 14]
header_function
Uint16 hash16 (const void *data, Size len, Uint16 seed)
{
const Byte *d = cast_val(const Byte*, data);
const Uint16 f = 0x7;
const Uint16 k = 0x8000;
Uint16 CRC = seed;
Size i;
// Run until zero or one bytes are left
for (i = 0; (i + 2) <= len; i = i + 2) {
const Uint16 hword = srlzRead16BE(d + i);
Uint16 a = cast_val(Uint16, CRC ^ hword);
Uint16 c;
if (a & k) {
c = cast_val(Uint16, (a << 1) ^ f);
} else {
c = cast_val(Uint16, a << 1);
}
if (c & k) {
CRC = cast_val(Uint16, (c << 1) ^ f);
} else {
CRC = cast_val(Uint16, c << 1);
}
CRC = cast_val(Uint16, CRC ^ c ^ a);
}
// Finish up the last byte
if ((i + 1) == len) {
const Uint16 hword = cast_val(Uint16, cast_val(Uint16, srlzRead8(d + i)) << 8);
Uint16 a = cast_val(Uint16, CRC ^ hword);
Uint16 c;
if (a & k) {
c = cast_val(Uint16, (a << 1) ^ f);
} else {
c = cast_val(Uint16, a << 1);
}
if (c & k) {
CRC = cast_val(Uint16, (c << 1) ^ f);
} else {
CRC = cast_val(Uint16, c << 1);
}
CRC = cast_val(Uint16, CRC ^ c ^ a);
}
return CRC;
}
// https://github.com/aappleby/smhasher/blob/0ff96f7835817a27d0487325b6c16033e2992eb5/src/MurmurHash3.cpp#L94-L146
// MurmurHash3 (x86_32) variant reading words big-endian; see link above.
header_function
Uint32 hash32 (const void *data, Size len, Uint32 seed)
{
    const Byte *d = cast_val(const Byte*, data);
    Uint32 H = seed;
    const Uint32 c1 = 0xcc9e2d51;
    const Uint32 c2 = 0x1b873593;
#if defined(ENV_COMPILER_MSVC)
# define ROTL32(x,y) _rotl(x,y)
#elif defined(ENV_COMPILER_GCC) || defined(ENV_COMPILER_CLANG)
# define ROTL32(x,y) ((x << y) | (x >> (32 - y)))
#endif
    Size i;
    // Body: mix in one 32-bit word at a time
    for (i = 0; (i + 4) <= len; i = i + 4) {
        Uint32 word = srlzRead32BE(d + i);
        word *= c1;
        word = ROTL32(word, 15);
        word *= c2;
        H ^= word;
        H = ROTL32(H, 13);
        H = H * 5 + 0xe6546b64;
    }
    { // Finish up the remaining bytes
        Size remaining = len - i;
        Uint32 k = 0;
        switch (remaining) {
            case 3: k ^= cast_val(Uint32, d[i+2]) << 16; switch_fallthrough;
            case 2: k ^= cast_val(Uint32, d[i+1]) << 8; switch_fallthrough;
            case 1: k ^= cast_val(Uint32, d[i]);
                k *= c1;
                k = ROTL32(k, 15);
                k *= c2;
                H ^= k;
                switch_fallthrough;
            default: break;
        }
    }
#undef ROTL32
    // Finalization: fold in the length, then avalanche.
    // BUGFIX(consistency): the original `H ^= len` implicitly narrowed Size to
    // Uint32; made explicit to match the file's strict cast discipline.
    H ^= cast_val(Uint32, len);
    H ^= H >> 16;
    H *= 0x85ebca6b;
    H ^= H >> 13;
    H *= 0xc2b2ae35;
    H ^= H >> 16;
    return H;
}
// https://github.com/aappleby/smhasher/blob/0ff96f7835817a27d0487325b6c16033e2992eb5/src/MurmurHash2.cpp#L88-L137
// MurmurHash2 (64-bit) variant reading 8-byte words big-endian; see link above.
header_function
Uint64 hash64 (const void *data, Size len, Uint64 seed)
{
    const Byte *d = cast_val(const Byte*, data);
    const Uint64 m = 0xc6a4a7935bd1e995ULL;
    const Sint r = 47;
    Uint64 H = seed ^ (len * m);
    Size i = 0;
    // Mix whole 8-byte words while at least eight bytes remain
    while ((i + 8) <= len) {
        Uint64 dword = srlzRead64BE(d + i);
        dword *= m;
        dword ^= dword >> r;
        dword *= m;
        H ^= dword;
        H *= m;
        i += 8;
    }
    // Fold in the 0-7 trailing bytes, highest byte first. This if-chain is
    // equivalent to the reference implementation's fallthrough switch.
    Size remaining = len - i;
    if (remaining >= 7) H ^= cast_val(Uint64, d[i+6]) << 48;
    if (remaining >= 6) H ^= cast_val(Uint64, d[i+5]) << 40;
    if (remaining >= 5) H ^= cast_val(Uint64, d[i+4]) << 32;
    if (remaining >= 4) H ^= cast_val(Uint64, d[i+3]) << 24;
    if (remaining >= 3) H ^= cast_val(Uint64, d[i+2]) << 16;
    if (remaining >= 2) H ^= cast_val(Uint64, d[i+1]) << 8;
    if (remaining >= 1) {
        H ^= cast_val(Uint64, d[i]);
        H *= m;
    }
    // Final avalanche
    H ^= H >> r;
    H *= m;
    H ^= H >> r;
    return H;
}
/************************************************************************************************
* Tag and Label (Small immutable stack-allocated strings)
* ***********************************************
*/
// We put the string at the end since most reads will be about comparing the hash and length;
// and even in cases when one has to read the string, the likelihood is that the string will be less
// than 248 characters long, meaning that later cache lines will never actually load.
// 256-byte stack string: three hash widths + length alias a single 64-bit id,
// so equality can start with one integer compare.
typedef struct Label {
    union { struct { Uint32 h32; Uint16 h16; Uint8 h8; Uint8 len; }; Uint64 _id; };
    Char str[247]; Char _zero; // = 0
} Label;
static_assert(sizeof(Label) == 256, "Label incorrectly sized");
// Capacity of the embedded string (the trailing _zero guarantees NUL termination)
#define labelStrCap() (elemin(valzero(Label).str))
// 32-byte variant of Label with a 32-bit combined id.
typedef struct Tag {
    union { struct { Uint16 h16; Uint8 h8; Uint8 len; }; Uint32 _id; };
    Char str[27]; Char _zero; // = 0
} Tag;
static_assert(sizeof(Tag) == 32, "Tag incorrectly sized");
#define tagStrCap() (elemin(valzero(Tag).str))
// Build a Label from `len` bytes of `str`; hashes are computed over the copy.
header_function
Label labelMakeL (const Char *str, Uint8 len)
{
    debugAssert(len <= labelStrCap());
    Label label = {
        .len = len,
    };
    memcpy(label.str, str, len);
    // Remaining bytes (and the trailing _zero) stay zero from the initializer.
    label.h32 = hash32(label.str, len, 0);
    label.h16 = hash16(label.str, len, 0);
    label.h8 = hash8(label.str, len, 0);
    return label;
}
// Build a Tag from `len` bytes of `str`; hashes are computed over the copy.
header_function
Tag tagMakeL (const Char *str, Uint8 len)
{
    debugAssert(len <= tagStrCap());
    Tag tag = {
        .len = len,
    };
    memcpy(tag.str, str, len);
    tag.h16 = hash16(tag.str, len, 0);
    tag.h8 = hash8(tag.str, len, 0);
    return tag;
}
// NOTE(review): strlen's result is truncated to Uint8 *before* the length
// assert inside labelMakeL/tagMakeL runs, so a string longer than 255 bytes
// would silently wrap — confirm inputs are always short.
header_function Label labelMake (const Char *str) { return labelMakeL(str, cast_val(Uint8, strlen(str))); }
header_function Tag tagMake (const Char *str) { return tagMakeL (str, cast_val(Uint8, strlen(str))); }
header_function Uint8 labelLen (Label l) { return l.len; }
header_function Uint8 tagLen (Tag l) { return l.len; }
// Equal _id implies equal hashes and lengths; the memcmp then confirms the bytes.
header_function Bool labelIsEqual (Label a, Label b) { return (a._id == b._id) && (memcmp(a.str, b.str, minimum(a.len, b.len)) == 0); }
header_function Bool tagIsEqual (Tag a, Tag b) { return (a._id == b._id) && (memcmp(a.str, b.str, minimum(a.len, b.len)) == 0); }
// Length-aware comparison of a Label against a NUL-terminated C string.
header_function
Bool labelIsEqualStr (Label a, const Char *b)
{
    return strleq(a.str, a.len, b);
}
// Length-aware comparison of a Tag against a NUL-terminated C string.
header_function
Bool tagIsEqualStr (Tag a, const Char *b)
{
    return strleq(a.str, a.len, b);
}
// Comparison against an explicit (pointer, length) pair; avoids a strlen on `b`.
header_function Bool labelIsEqualStrL (Label a, const Char *b, Size blen) { return (a.len == blen) && (memcmp(a.str, b, blen) == 0); }
header_function Bool tagIsEqualStrL (Tag a, const Char *b, Size blen) { return (a.len == blen) && (memcmp(a.str, b, blen) == 0); }
// Serialize/deserialize a Label through the caller-supplied marshal callback.
// Wire format: one length byte followed by `len` raw characters. On read
// (write == false) the hashes are recomputed by rebuilding the Label.
header_function
Bool marshalLabel (Label *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    if (!marshalUint8(&datum->len, func, funcdata, write, big_endian)) return false;
    if (!func(datum->str, datum->len, funcdata)) return false;
    if (!write) *datum = labelMakeL(datum->str, datum->len);
    return true;
}
// Same wire format and read-side rebuild as marshalLabel, for Tag.
header_function
Bool marshalTag (Tag *datum, MARSHAL_FUNC((*func)), void *funcdata, Bool write, Bool big_endian) {
    if (!marshalUint8(&datum->len, func, funcdata, write, big_endian)) return false;
    if (!func(datum->str, datum->len, funcdata)) return false;
    if (!write) *datum = tagMakeL(datum->str, datum->len);
    return true;
}
/************************************************************************************************
* Txt (String Library)
* ********************
*/
// Arena for Txt strings: a thin wrapper over the push allocator so whole
// batches of strings can be reset or freed at once.
typedef struct Txt_Arena {
    Memory_Push mp;
} Txt_Arena;
// Create a Txt arena backed by 1 MiB push-allocator blocks.
header_function
Txt_Arena txtArenaCreate (Memory_Allocator_Interface mai)
{
    Txt_Arena arena = {};
    memPushCreate(&arena.mp, mai, MiB(1));
    return arena;
}
// Return every arena block to the parent allocator; all strings become invalid.
header_function
void txtArenaDelete (Txt_Arena *ta)
{
    memPushDelete(&ta->mp);
}
// Forget all strings but keep the blocks for reuse.
header_function
void txtArenaReset (Txt_Arena *ta)
{
    memPushRemitAll(&ta->mp);
}
typedef struct Txt { Char *str; } Txt;
typedef struct Txt_Header_ {
Uint32 len;
Char str[];
} Txt_Header_;
// Allocate storage for a Txt of `len` characters (plus header and NUL) and
// NUL-terminate it. The returned Txt points just past the Txt_Header_ so
// txtLen can recover the header with containerof.
header_function
Txt txt_Allocate (Txt_Arena *ta, Size len)
{
Size size = len + 1 + sizeof(Txt_Header_);
Txt_Header_ *hdr = cast_val(Txt_Header_*, memPushAllotA(&ta->mp, size, alignof(Txt_Header_)));
debugAssert(len < UINT32_MAX); // the header stores the length in a Uint32
hdr->len = cast_val(Uint32, len);
hdr->str[len] = '\0';
Char *s = hdr->str;
Txt t = {.str = s};
return t;
}
// Allocate a Txt of exactly `len` characters and fill it from s
// (txt_Allocate provides the NUL terminator).
header_function
Txt txtNewL (Txt_Arena *ta, Char *s, Size len)
{
    Txt fresh = txt_Allocate(ta, len);
    memcpy(fresh.str, s, len);
    return fresh;
}
// Convenience wrapper over txtNewL for NUL-terminated input.
header_function
Txt txtNew (Txt_Arena *ta, Char *s)
{
    return txtNewL(ta, s, strlen(s));
}
// Raw pointer to the NUL-terminated character data.
header_function
Char* txtStr (Txt t)
{
Char *s = t.str;
return s;
}
// Length in characters, read back from the header preceding the data.
header_function
Size txtLen (Txt t)
{
Txt_Header_ *hdr = containerof(t.str, Txt_Header_, str);
return hdr->len;
}
// Raw comparison of the first `len` bytes of both strings.
header_function
Bool txt_EqN (Txt t1, Txt t2, Size len)
{
    return memcmp(t1.str, t2.str, len) == 0;
}
// Two Txts are equal iff their lengths match and their bytes match.
header_function
Bool txtEqTT (Txt t1, Txt t2)
{
    Size len1 = txtLen(t1);
    if (len1 != txtLen(t2)) return false;
    return txt_EqN(t1, t2, len1);
}
// Compare a Txt against a NUL-terminated C string.
header_function
Bool txtEqTC (Txt t1, Char *s2)
{
    Size len = txtLen(t1);
    if (strlen(s2) != len) return false;
    return memcmp(txtStr(t1), s2, len) == 0;
}
// Compare a Txt against a Label (length-prefixed small string).
header_function
Bool txtEqTL (Txt t1, Label l2)
{
    Size len = txtLen(t1);
    if (labelLen(l2) != len) return false;
    return memcmp(txtStr(t1), l2.str, len) == 0;
}
// Type-dispatched equality: the second argument's static type selects the
// Txt/Char*/Label overload at compile time.
#define txtEq(_a, _b) \
_Generic((_b) \
, Txt : txtEqTT \
, Char* : txtEqTC \
, Label : txtEqTL)(_a, _b)
// Find the FIRST occurrence of c in t; returns nullptr when absent.
header_function
Char* txtChr (Txt t, Char c)
{
    Char *cursor = txtStr(t);
    Char *end = cursor + txtLen(t);
    for (; cursor != end; cursor++) {
        if (*cursor == c) return cursor;
    }
    return nullptr;
}
// Find the LAST occurrence of c in t; returns nullptr when absent.
// BUG FIX: the previous loop incremented i from len-1, so it only ever
// examined the final character; a reverse scan must walk downwards. The
// `i > 0` guard with an `i-1` access is correct regardless of whether Size
// is signed or unsigned, and handles len == 0.
header_function
Char* txtChrR (Txt t, Char c)
{
    Size len = txtLen(t);
    Char *str = txtStr(t);
    for (Size i = len; i > 0; i--) {
        if (str[i-1] == c) return &str[i-1];
    }
    return nullptr;
}
typedef struct Txt_Span {
Txt t;
Uint32 begin, end; // begin inclusive, end exclusive { [begin, end) }
Uint64 hash;
} Txt_Span;
// Make a span over t. Negative indices count back from the end of the
// string (Python-style). A precomputed hash of the span's bytes is stored
// for fast comparisons.
// NOTE(review): `begin` itself is never clamped to [0, len]; a begin at or
// past the end would hash out-of-range bytes — confirm callers guarantee an
// in-range begin.
header_function
Txt_Span txtGetSpan (Txt t, Sint64 begin, Sint64 end)
{
if (begin < 0) begin = cast_val(Uint32, cast_val(Sint32, txtLen(t)) - (-begin));
if (end < 0) end = cast_val(Uint32, cast_val(Sint32, txtLen(t)) - (-end));
if (end <= begin) {
end = begin + 1; // So that we don't end up with unnecessary errors
}
if (end > cast_val(Sint, txtLen(t))) {
end = cast_val(Sint, txtLen(t));
}
Txt_Span ts = {
.t = t,
.begin = cast_val(Uint32, begin),
.end = cast_val(Uint32, end),
.hash = hash64(txtStr(t) + begin, cast_val(Size, end - begin), 0),
};
return ts;
}
// Pointer to the first character of the span inside its backing Txt.
header_function
Char* txtspanStr(Txt_Span ts)
{
    return txtStr(ts.t) + ts.begin;
}
// Span length; `end` is exclusive, so this is simply the difference.
header_function
Size txtspanLen (Txt_Span ts)
{
    return ts.end - ts.begin;
}
// Copy the span's bytes out into a freshly allocated Txt.
header_function
Txt txtspanExtract (Txt_Arena *ta, Txt_Span ts)
{
    Txt copy = txtNewL(ta, txtspanStr(ts), txtspanLen(ts));
    return copy;
}
// Span vs NUL-terminated string: lengths must match exactly, then the bytes.
header_function
Bool txtspanIsEqualStr (Txt_Span ts, Char *str)
{
Size strs = strlen(str);
Size tss = ts.end - ts.begin;
return (strs == tss) && (memcmp(str, ts.t.str + ts.begin, tss) == 0);
}
// Span vs Label, via the length-explicit Label comparison.
header_function
Bool txtspanIsEqualLabel (Txt_Span ts, Label l)
{
return labelIsEqualStrL(l, ts.t.str + ts.begin, ts.end-ts.begin);
}
// Copy a span's bytes into a Label (the span length is truncated into a Uint8).
header_function
Label labelMakeTS (Txt_Span ts)
{
return labelMakeL(ts.t.str + ts.begin, cast_val(Uint8, ts.end - ts.begin));
}
typedef struct Txt_Fmt_Chunk Txt_Fmt_Chunk;
// Pool of fixed-size chunks (slab-allocated) shared by Txt_Fmt builders.
typedef struct Txt_Fmt_Pool {
Memory_Allocator_Interface miface;
Memory_Slab chunk_allocator;
Size chunk_count;
} Txt_Fmt_Pool;
// Append-only string builder: a singly-linked chain of 64-byte chunks.
typedef struct Txt_Fmt {
Txt_Fmt_Pool *pool;
Size length; // total characters appended so far, across all chunks
Txt_Fmt_Chunk *first_chunk, *last_chunk;
} Txt_Fmt;
struct Txt_Fmt_Chunk {
Txt_Fmt_Chunk *next_chunk;
Char data[55]; // 64 - 8 (next_chunk) - 1 (filled) = 55 payload bytes
Uint8 filled; // bytes of data[] in use
};
static_assert(sizeof(Txt_Fmt_Chunk) == 64, "Txt fmt chunk won't fit in a single cache line");
#define TXT_FMT_CHARS_PER_CHUNK_ elemin(compound_init(Txt_Fmt_Chunk, {}).data)
static_assert(TXT_FMT_CHARS_PER_CHUNK_ == 55, "Number of bytes in each Txt fmt chunk is wrong");
// Initialize a chunk pool. chunk_count is raised to a floor of 512 chunks
// (512 * 64 B = 32 KiB).
header_function
void txtfmtPoolCreate (Txt_Fmt_Pool *tp, Memory_Allocator_Interface mem, Uint32 chunk_count)
{
// 512 chunks in 32 KiB (same size as the free-grid metadata)
chunk_count = maximum(chunk_count, 512u);
// This is initialized like this instead of a compound initialize to avoid the stack from blowing up.
memset(tp, 0, sizeof(*tp));
tp->chunk_count = chunk_count;
slabCreate(&tp->chunk_allocator, mem, chunk_count, sizeof(Txt_Fmt_Chunk));
}
// Free the slab; outstanding Txt_Fmt builders become invalid.
header_function
void txtfmtPoolDelete (Txt_Fmt_Pool *tp)
{
slabDelete(&tp->chunk_allocator);
}
// Begin building a string; grabs (and zeroes) the first chunk from the pool.
header_function
Txt_Fmt txtfmtCreate (Txt_Fmt_Pool *tp)
{
Txt_Fmt txt = {
.pool = tp,
};
txt.first_chunk = cast_val(Txt_Fmt_Chunk*, slabRequest(&tp->chunk_allocator));
memzero(txt.first_chunk);
txt.last_chunk = txt.first_chunk;
return txt;
}
// Return every chunk of the builder to the pool (abandoning the contents).
header_function
void txtfmtRemove (Txt_Fmt *tf)
{
for (Txt_Fmt_Chunk *tc = tf->first_chunk, *tcc; tc != nullptr; tc = tcc) {
tcc = tc->next_chunk; // saved before tc is rescinded
slabRescind(&tf->pool->chunk_allocator, tc);
}
}
// Chunk to append into; allocates and links a fresh one when the tail is full.
header_function
Txt_Fmt_Chunk* txtfmt_GetLastChunk (Txt_Fmt *tf)
{
if (tf->last_chunk->filled == TXT_FMT_CHARS_PER_CHUNK_) {
tf->last_chunk->next_chunk = cast_val(Txt_Fmt_Chunk*, slabRequest(&tf->pool->chunk_allocator));
memzero(tf->last_chunk->next_chunk);
tf->last_chunk = tf->last_chunk->next_chunk;
}
return tf->last_chunk;
}
// Append a single character; always writes exactly one byte and returns 1.
header_function
Size txtfmtAffix (Txt_Fmt *tf, Char c)
{
    Txt_Fmt_Chunk *chunk = txtfmt_GetLastChunk(tf);
    chunk->data[chunk->filled] = c;
    chunk->filled += 1;
    tf->length += 1;
    return 1;
}
// Sink for printStringVarArg: copies the formatter's staging buffer into the
// chunk chain, splitting across a chunk boundary at most once per call.
// NOTE(review): assumes buffer_len <= TXT_FMT_CHARS_PER_CHUNK_ (true for the
// staging buffer in txtfmtAppendFV) — confirm if wired to another formatter.
header_function
PRINT_CALLBACK_FUNC(txtfmt_PrintCallback)
{
unused_variable(buffer_cap);
Txt_Fmt *tf = cast_val(Txt_Fmt*, userdata);
Txt_Fmt_Chunk *s = txtfmt_GetLastChunk(tf);
debugAssert(s);
Size length = cast_val(Size, buffer_len);
Size empty = TXT_FMT_CHARS_PER_CHUNK_ - s->filled;
if (length > empty) {
// Fill the remainder of the current chunk, then spill into a fresh one.
memcpy(s->data + s->filled, filled_buffer, empty);
s->filled += cast_val(Uint8, empty);
if (length - empty) {
s = txtfmt_GetLastChunk(tf);
memcpy(s->data, filled_buffer + empty, length - empty);
s->filled += cast_val(Uint8, length - empty);
}
} else if (length != 0) {
memcpy(s->data + s->filled, filled_buffer, length);
s->filled += cast_val(Uint8, length);
}
tf->length += length;
}
// printf-style append (va_list form). `buf` is only a staging area: the
// formatter flushes it into the chunk chain via txtfmt_PrintCallback.
with_clang_gcc(__attribute__(( format( __printf__, 2, 0 ))))
header_function
Size txtfmtAppendFV (Txt_Fmt *tf, const Char *fmt, va_list va)
{
Char buf[TXT_FMT_CHARS_PER_CHUNK_] = {};
Size size = printStringVarArg(buf, elemin(buf), fmt, va, txtfmt_PrintCallback, tf);
return size;
}
// printf-style append (variadic form); forwards to txtfmtAppendFV.
with_clang_gcc(__attribute__ (( format( __printf__, 2, 3 ))))
header_function
Size txtfmtAppendF (Txt_Fmt *tf, const Char *fmt, ...)
{
va_list args;
va_start(args, fmt);
Size result = txtfmtAppendFV(tf, fmt, args);
va_end(args);
return result;
}
// Append `length` raw bytes, spilling across as many chunks as needed.
// Returns the number of bytes copied (== length).
header_function
Size txtfmtAppendCL (Txt_Fmt *tf, const Char *tc, Size length)
{
Size copied = 0;
Size len = length;
while (len) {
Txt_Fmt_Chunk *s = txtfmt_GetLastChunk(tf);
Size l = minimum(len, TXT_FMT_CHARS_PER_CHUNK_);
l = minimum(l, TXT_FMT_CHARS_PER_CHUNK_ - s->filled); // space left in this chunk
memcpy(s->data + s->filled, tc + copied, l);
copied += l;
len -= l;
s->filled += cast_val(Uint8, l);
}
tf->length += length;
return copied;
}
// Append a NUL-terminated C string.
header_function
Size txtfmtAppendC (Txt_Fmt *tf, const Char *tc)
{
    return txtfmtAppendCL(tf, tc, strlen(tc));
}
// Append a Txt (length comes from its header).
header_function
Size txtfmtAppendT (Txt_Fmt *tf, Txt txt)
{
    return txtfmtAppendCL(tf, txtStr(txt), txtLen(txt));
}
// Append a Txt_Span; [begin, end) with end exclusive.
header_function
Size txtfmtAppendTs (Txt_Fmt *tf, Txt_Span ts)
{
    return txtfmtAppendCL(tf, txtStr(ts.t) + ts.begin, cast_val(Size, ts.end - ts.begin));
}
// Append a Label (length-prefixed small string).
header_function
Size txtfmtAppendLbl (Txt_Fmt *tf, Label lbl)
{
    return txtfmtAppendCL(tf, lbl.str, lbl.len);
}
// Materialize the accumulated chunks into one arena-allocated Txt and release
// the chunks back to the pool. The builder must not be used after this.
header_function
Txt txtfmtFinish (Txt_Fmt *tf, Txt_Arena *ta)
{
Txt t = txt_Allocate(ta, tf->length);
Size filled = 0;
for (Txt_Fmt_Chunk *s = tf->first_chunk; s != nullptr; s = s->next_chunk) {
debugAssert((filled + s->filled) < UINT32_MAX); // Txt length is a Uint32
memcpy(txtStr(t) + filled, s->data, s->filled);
filled += s->filled;
}
(txtStr(t))[filled] = '\0';
txtfmtRemove(tf);
return t;
}
#undef TXT_FMT_CHARS_PER_CHUNK_ // BUG FIX: previously undef'd TXT_FMT_BYTES_PER_CHUNK_, a name never defined, so the real helper macro leaked past this point
/************************************************************************************************
* Ravel (Print to string)
*
* What can change the nature of a man?
*/
// ravel*: append a human-readable rendering of one value to a Txt_Fmt and
// return the number of characters written. Small integers are widened to
// Sint/Uint so a single %d/%u suffices; Float32 is widened to Float64 to
// match printf's default argument promotion.
header_function
Size ravelSint8 (Txt_Fmt *fmt, Sint8 val)
{
Size s = txtfmtAppendF(fmt, "%d", cast_val(Sint, val));
return s;
}
header_function
Size ravelUint8 (Txt_Fmt *fmt, Uint8 val)
{
Size s = txtfmtAppendF(fmt, "%u", cast_val(Uint, val));
return s;
}
header_function
Size ravelSint16 (Txt_Fmt *fmt, Sint16 val)
{
Size s = txtfmtAppendF(fmt, "%d", cast_val(Sint, val));
return s;
}
header_function
Size ravelUint16 (Txt_Fmt *fmt, Uint16 val)
{
Size s = txtfmtAppendF(fmt, "%u", cast_val(Uint, val));
return s;
}
header_function
Size ravelSint32 (Txt_Fmt *fmt, Sint32 val)
{
Size s = txtfmtAppendF(fmt, "%d", val);
return s;
}
header_function
Size ravelUint32 (Txt_Fmt *fmt, Uint32 val)
{
Size s = txtfmtAppendF(fmt, "%u", val);
return s;
}
header_function
Size ravelFloat32 (Txt_Fmt *fmt, Float32 val)
{
Size s = txtfmtAppendF(fmt, "%f", cast_val(Float64, val));
return s;
}
// 64-bit integers use %lld/%llu, i.e. Sint64/Uint64 are assumed to be
// (un)signed long long — consistent with the x64-only targets above.
header_function
Size ravelSint64 (Txt_Fmt *fmt, Sint64 val)
{
Size s = txtfmtAppendF(fmt, "%lld", val);
return s;
}
header_function
Size ravelUint64 (Txt_Fmt *fmt, Uint64 val)
{
Size s = txtfmtAppendF(fmt, "%llu", val);
return s;
}
header_function
Size ravelFloat64 (Txt_Fmt *fmt, Float64 val)
{
Size s = txtfmtAppendF(fmt, "%f", val);
return s;
}
// String-like values are rendered wrapped in double quotes.
header_function
Size ravelStr (Txt_Fmt *fmt, Char *str)
{
Size s = txtfmtAppendF(fmt, "\"%s\"", str);
return s;
}
header_function
Size ravelLabel (Txt_Fmt *fmt, Label lbl)
{
Size s = txtfmtAppendF(fmt, "\"%.*s\"", cast_val(Sint, lbl.len), lbl.str);
return s;
}
header_function
Size ravelTxt (Txt_Fmt *fmt, Txt t)
{
Size s = txtfmtAppendF(fmt, "\"%.*s\"", cast_val(Sint, txtLen(t)), txtStr(t));
return s;
}
header_function
Size ravelTxtSpan (Txt_Fmt *fmt, Txt_Span ts)
{
Size s = txtfmtAppendF(fmt, "\"%.*s\"", cast_val(Sint, txtspanLen(ts)), txtspanStr(ts));
return s;
}
// Type-dispatched ravel: _val's static type selects the renderer.
#define ravel(_fmt, _val) \
_Generic((_val) \
, Sint8 : ravelSint8 \
, Uint8 : ravelUint8 \
, Sint16 : ravelSint16 \
, Uint16 : ravelUint16 \
, Sint32 : ravelSint32 \
, Uint32 : ravelUint32 \
, Float32 : ravelFloat32 \
, Sint64 : ravelSint64 \
, Uint64 : ravelUint64 \
, Float64 : ravelFloat64 \
, Char* : ravelStr \
, Label : ravelLabel \
, Txt : ravelTxt \
, Txt_Span : ravelTxtSpan \
)(_fmt, _val)
/************************************************************************************************
* B-Tree
*/
// B-Tree implementation
// Reference: Chapter 18 of Introduction to Algorithms - Fourth Edition by Cormen, et al.
// Some changes have been made to make the implementation more amenable to future optimizations
// (SIMD or otherwise).
// First, the idea of "minimum degree" has been discarded in favour of having a maximum count.
// This means that we store an even number of keys instead of an odd number.
// Second, there is no leaf boolean; the leaf property is known by comparing the first child to nullptr.
// WARN(naman): This BTree doesn't rebalance itself when keys are dropped. This means that the size
// of the tree only ever goes up. Thus, don't use this in places where a lot of deletions will
// be taking place, since an unnecessarily large table will slow down the lookups.
// NOTE(naman): All dynamic allocations are of the same size (sizeof(BTree_Node)), so a slab
// allocator can be used. Alternatively, since not too many deletions would be performed (see above),
// a simple stack allocator will also suffice.
// The value of M is chosen so that all keys can be compared using 5 256-wide SSE instructions.
// In addition, this also makes the BTree_Node align with cache line sizes of 64, 128, 256 and 512 bytes.
#define M 20
typedef struct BTree_Node {
// In the keys and data array, [M/2 - 1, M] elements have to be present
// NOTE(naman): More often used members are on top together for cache friendliness.
Uint64 keys[M]; // Called k in Cormen
Size count; // Called n in Cormen
struct BTree_Node *children[M + 1]; // Called c in Cormen
void *data[M];
struct BTree_Node *next, *prev; // Unused for now
} BTree_Node;
static_assert(sizeof(BTree_Node) == 512, "BTree_Node's size is not expected");
typedef struct BTree {
Memory_Allocator_Interface miface;
BTree_Node *root;
} BTree;
// Initialize a tree with an empty (leaf) root node.
// NOTE(review): the root's fields (count, children) are assumed to come back
// zeroed from memIRequest — confirm the allocator interface's contract, since
// every traversal relies on count == 0 and children[0] == nullptr here.
header_function
void btreeCreate (BTree *b, Memory_Allocator_Interface miface)
{
memzero(b);
b->miface = miface;
b->root = cast_val(BTree_Node *, memIRequest(b->miface, sizeof(*b->root)));
}
header_function
void btreeDelete (BTree *b)
{
memIRescind(b->miface, b->root);
}
// Core lookup walk shared by search/replace/remove. Descends from the root;
// on an exact key match returns the old value (and stores `replacement`
// when replace_value is set). Returns nullptr when the key is absent.
header_function
void* btree_FindReplace (BTree *b, Uint64 key, Bool replace_value, void *replacement)
{
BTree_Node *n = b->root;
while (n != nullptr) {
Size i;
Bool found_equal = false;
// Linear scan for the first key >= `key` within this node.
for (i = 0; i < n->count; i++) {
if (n->keys[i] == key) {
found_equal = true;
break;
} else if (n->keys[i] > key) {
break;
}
}
if (found_equal) {
void *old = n->data[i];
if (replace_value) n->data[i] = replacement;
return old;
} else if (n->children[0] == nullptr) { // Leaf Node
return nullptr;
} else {
// If greater found, i is pointing to the key just greater, meaning the i-th child is
// between the just smaller and just greater keys. If greater was not found, then i is
// equal to n->count, which also points to the last child.
n = n->children[i];
}
}
return nullptr;
}
// Lookup without modification; nullptr when the key is absent.
header_function
void* btreeSearch (BTree *b, Uint64 key) {
void *result = btree_FindReplace (b, key, false, nullptr);
return result;
}
// Overwrite the value of an existing key; returns the old value.
// Does nothing (and returns nullptr) if the key is absent.
header_function
void* btreeReplace (BTree *b, Uint64 key, void *new_value) {
void *result = btree_FindReplace (b, key, true, new_value);
return result;
}
// "Remove" only nulls the value; the key stays in the tree (see the WARN
// above about the structure never shrinking). Returns the old value.
header_function
void* btreeRemove (BTree *b, Uint64 key) {
void *result = btree_FindReplace (b, key, true, nullptr);
return result;
}
// NOTE(naman): btreeInsert (below) is the function that returns the previous value, if any.
// Split the full I-th child of `parent` into two half-full nodes, hoisting
// the median key into `parent`. `parent` itself must NOT be full.
header_function
void btree_SplitChild (BTree *b, BTree_Node *parent, Size i)
{
// All indices in this function's comments (I, T, etc.) are zero indexed. Pay close attention.
// To mark them against regular numbers, they are written in capital. `i` and `I` are both the same,
// `i` is used when talking about numbers (count, etc.), while `I` is used when talking about indices.
// This function gets called if the I-th child of the node has become full (i.e., has m keys)
// and needs to be split.
BTree_Node *full = parent->children[i];
BTree_Node *newnode = cast_val(BTree_Node *, memIRequest(b->miface, sizeof(*newnode)));
// We will move the last m/2-1 keys from `full` into `newnode`. Since `full` had m keys, this means that
// `full` will be left with m/2+1 keys from [0...M/2] and m/2-1 keys from [M/2+1...M-1] will become `newnode`'s keys
// from [0..M/2-1].
newnode->count = M/2 - 1;
memcpy(&newnode->keys[0], &full->keys[M/2 + 1], newnode->count * sizeof(newnode->keys[0]));
memcpy(&newnode->data[0], &full->data[M/2 + 1], newnode->count * sizeof(newnode->data[0]));
// If the `full` node is not a leaf, it has a bunch of children. Since the trailing keys got moved,
// the children whose pointers fall between those keys also have to be moved. Because we moved m/2-1 keys,
// there would be m/2 children associated with them that also need to be moved. This means that the
// M/2-th key in `full` will have no right-child anymore. But that's okay, since the M/2-th
// key will get moved to `parent` anyways. So, we will move m/2 children from `full`'s [M/2+1...M]
// to `newnode`'s [0...M/2-1].
if (full->children[0] != nullptr) { // Not a leaf
memcpy(&newnode->children[0], &full->children[M/2+1], (newnode->count + 1) * sizeof(newnode->children[0]));
}
// Finally, we set the `full`'s count to m/2. Remember that `full` actually still has a total of m/2+1 elements.
// However, the element on index M/2 will soon get moved to the `parent`.
full->count = M/2;
// But before we can move `full`'s M/2 element and attach `newnode` as a new child, we have to make space
// for it in `parent`. First, let's move the children. Let parent->count be n. If the total number
// of keys that the parent has is n from 0 to N-1, the children would be n+1 from 0 to N. Since the
// `full` was attached at index I, we need to move all children from [I+1...N] to [I+2...N+1], so that
// we can add `newnode` as a child at index I+1.
for (Size j = (parent->count + 1); j >= (i + 2); j--) {
parent->children[j] = parent->children[j-1];
}
parent->children[i+1] = newnode;
// Now, the turn for keys. Since `full`'s M/2 element has to be moved to `parent`'s I element (right between
// the I-th and (I+1)-th child), we need to make space for it by moving elements from [I...N-1] to [I+1...N].
for (Size j = parent->count; j >= (i + 1); j--) {
parent->keys[j] = parent->keys[j-1];
parent->data[j] = parent->data[j-1];
}
// Now that there is a space in `parent`'s key array, let's put `full`'s M/2 element there.
parent->keys[i] = full->keys[M/2];
parent->data[i] = full->data[M/2];
parent->count++;
}
pragma_msvc("warning ( push )");
pragma_msvc("warning ( disable: 6385 )"); // Analyze: READ_OVERRUN (Reading invalid data from 'n->keys' and 'n->children')
// Insert key -> datum. Returns the previous value when the key already
// existed, nullptr otherwise. Splits any full node encountered on the way
// down so the destination leaf always has room (single-pass insertion).
header_function
void* btreeInsert (BTree *b, Uint64 key, void *datum)
{
if (b->root->count == M) {
// This is a clever hack. Cormen wanted to use the existing splitting function (btree_SplitChild) to spit the root too.
// But splitting requires transferring one item to our parent, and the root has no parent. So, they temporarily create
// an invalid tree by creating a new node with zero elements and setting the root as its child. But the splitting process
// adds one element and one more child (the new splitted node) to the new root, thus making the tree valid again.
BTree_Node *s = cast_val(BTree_Node *, memIRequest(b->miface, sizeof(*s)));
s->children[0] = b->root;
b->root = s;
btree_SplitChild(b, b->root, 0);
}
BTree_Node *n = b->root;
while (n != nullptr) {
Size i = n->count;
if (n->children[0] == nullptr) { // Leaf
if ((key >= n->keys[0]) && (key <= n->keys[i-1])) { // If the key is in the space of this node
for (Size j = 0; j < i; j++) { // Check if the key has already been inserted
if (key == n->keys[j]) {
void *result = n->data[j];
n->data[j] = datum;
return result;
}
}
}
while ((i >= 1) && (key < n->keys[i-1])) { // Else move the data
n->keys[i] = n->keys[i-1];
n->data[i] = n->data[i-1];
i--;
}
n->keys[i] = key; // And copy the key
n->data[i] = datum;
n->count++;
return nullptr;
} else {
// Internal node: find the child whose range contains the key.
while ((i >= 1) && (key < n->keys[i-1])) {
i--;
}
if (n->children[i]->count == M) {
btree_SplitChild(b, n, i);
// The split hoisted a median into n at index i; step right if our
// key sorts after it.
if (key > n->keys[i]) {
i++;
}
}
n = n->children[i];
}
}
return nullptr;
}
pragma_msvc("warning ( pop )");
// Upsert helper: `oldval` acts purely as a flag — non-null means the caller
// already holds a value for this key (replace in place), null means insert.
header_function
void* btreeSwitch (BTree *b, Uint64 key, void *oldval, void *newval)
{
return oldval ? btreeReplace(b, key, newval) : btreeInsert(b, key, newval);
}
// Return false to prematurely stop iteration
#define BTREE_ITERATOR_FUNC(name) Bool name (void *funcdata, Uint64 key, void* value)
// In-order traversal: child i before key i, finishing with the last child
// (children[count]). Leaf recursion terminates on the nullptr children.
header_function
Bool btree_IterateNode (BTree_Node *bn, BTREE_ITERATOR_FUNC((*func)), void *funcdata)
{
if (bn == nullptr) return true;
Size i;
for (i = 0; i < bn->count; i++) {
if (btree_IterateNode(bn->children[i], func, funcdata) == false) return false;
if (func(funcdata, bn->keys[i], bn->data[i]) == false) return false;
}
if (btree_IterateNode(bn->children[i], func, funcdata) == false) return false;
return true;
}
// Visit every key/value in ascending key order; stops early (returning
// false) as soon as func returns false.
header_function
Bool btreeIterate (BTree *b, BTREE_ITERATOR_FUNC((*func)), void *funcdata)
{
Bool result = btree_IterateNode(b->root, func, funcdata);
return result;
}
#undef M
/************************************************************************************************
* Hashmap
*/
// WARN(naman): Same performance characteristics as the BTree. See its documentation for more details
// Bucket node: entries whose hashes collide are chained through `next`; the
// full hash and key are kept so matches can be verified exactly.
typedef struct Hashmap_Entry {
struct Hashmap_Entry *next;
Uint64 hash;
Uint64 key;
void *value;
} Hashmap_Entry;
// Hash table built on the BTree: the btree maps hash -> Hashmap_Entry chain.
typedef struct Hashmap {
BTree btree;
COMPARE_FUNC((*compare_func)); // decides key equality (keys are opaque Uint64s)
Memory_Allocator_Interface entry_alloc; // Allocates sizeof(Hashmap_Entry) (32 bytes on 64-bit) at a time
} Hashmap;
// Initialize the map. entry_alloc supplies Hashmap_Entry nodes; node_alloc
// supplies the underlying BTree's nodes.
header_function
void hashmapCreate (Hashmap *hm, COMPARE_FUNC((*compare_func)),
Memory_Allocator_Interface entry_alloc, Memory_Allocator_Interface node_alloc)
{
*hm = compound_init(Hashmap, {
.compare_func = compare_func,
.entry_alloc = entry_alloc,
});
btreeCreate(&hm->btree, node_alloc);
}
// Return false to prematurely stop iteration
#define HASHMAP_ITERATOR_FUNC(name) Bool name (void *funcdata, Hashmap_Entry *entry)
// Adapter state threaded through the underlying BTree iteration.
struct Hashmap_Iterator_Data {
HASHMAP_ITERATOR_FUNC((*func));
void *funcdata;
};
// BTree callback: walks one bucket's collision chain, forwarding each entry.
// The successor `en` is saved up front so the user callback may free the
// current entry (hashmapDelete relies on this).
header_function
BTREE_ITERATOR_FUNC(hashmap_IterateNode)
{
unused_variable(key);
struct Hashmap_Iterator_Data *data = cast_val(struct Hashmap_Iterator_Data *, funcdata);
Hashmap_Entry *list = cast_val(Hashmap_Entry *, value);
for (Hashmap_Entry *e = list, *en; e != nullptr; e = en) {
en = e->next;
if ((data->func)(data->funcdata, e) == false) return false;
}
return true;
}
// Visit every entry in the map; stops early if func returns false.
header_function
Bool hashmapIterate (Hashmap *hm, HASHMAP_ITERATOR_FUNC((*func)), void *funcdata)
{
struct Hashmap_Iterator_Data data = {
.func = func,
.funcdata = funcdata,
};
Bool result = btree_IterateNode(hm->btree.root, hashmap_IterateNode, &data);
return result;
}
// Iterator callback used by hashmapDelete: frees one entry node.
header_function
HASHMAP_ITERATOR_FUNC(hashmap_DeleteNodeList)
{
Hashmap *hm = cast_val(Hashmap *, funcdata);
memIRescind(hm->entry_alloc, entry);
return true;
}
// Free every entry, then the btree, then scrub the struct itself.
header_function
void hashmapDelete (Hashmap *hm)
{
hashmapIterate(hm, hashmap_DeleteNodeList, hm);
btreeDelete(&hm->btree);
memzero(hm);
}
// Returns the existing data from BTree and saves out matched entry into matched pointer
// NOTE: the bucket head `list` is returned whether or not anything matched
// (it may be nullptr when the hash is absent); *matched is written only on
// an exact compare_func match.
header_function
Hashmap_Entry* hashmap_FindEntry (Hashmap *hm, Uint64 hash, Uint64 key, Hashmap_Entry **matched)
{
Hashmap_Entry *list = cast_val(Hashmap_Entry *, btreeSearch(&hm->btree, hash));
for (Hashmap_Entry *ei = list; ei != nullptr; ei = ei->next) {
if (hm->compare_func(cast_bit(void *, ei->key), cast_bit(void *, key)) == 0) {
if (matched) *matched = ei;
return list;
}
}
return list;
}
// Insert or overwrite. An existing key has its value replaced in place;
// otherwise a new entry is pushed onto the front of the bucket chain and the
// btree slot is updated (insert when the bucket was empty, replace otherwise).
header_function
void hashmapInsert (Hashmap *hm, Uint64 hash, Uint64 key, void *value)
{
Hashmap_Entry *matched = nullptr;
Hashmap_Entry *list = hashmap_FindEntry(hm, hash, key, &matched);
if (matched) {
matched->value = value;
return;
}
Hashmap_Entry *e = cast_val(Hashmap_Entry *, memIRequest(hm->entry_alloc, sizeof(*e)));
*e = compound_init(Hashmap_Entry, {
.next = list,
.hash = hash,
.key = key,
.value = value,
});
btreeSwitch(&hm->btree, hash, list, e);
}
// Lookup; returns nullptr when the key is absent (indistinguishable from a
// stored nullptr value).
header_function
void* hashmapSearch (Hashmap *hm, Uint64 hash, Uint64 key)
{
Hashmap_Entry *matched = nullptr;
hashmap_FindEntry(hm, hash, key, &matched);
return matched ? matched->value : nullptr;
}
// Remove the entry for (hash, key), if present, and free its node.
// BUG FIX: when the matched entry was the HEAD of its bucket chain it was
// unlinked from the btree but never rescinded, leaking the entry (the
// non-head path below did free it). Both paths now free the node.
header_function
void hashmapRemove (Hashmap *hm, Uint64 hash, Uint64 key)
{
    Hashmap_Entry *matched = nullptr;
    Hashmap_Entry *list = hashmap_FindEntry(hm, hash, key, &matched);
    if (matched == nullptr) return;
    if (list == matched) {
        // Head removal: point the btree slot at the rest of the chain.
        btreeSwitch(&hm->btree, hash, list, list->next);
        memIRescind(hm->entry_alloc, matched);
    } else {
        // Interior removal: find the predecessor and splice matched out.
        for (Hashmap_Entry *e = list; e != nullptr; e = e->next) {
            if (e->next == matched) {
                e->next = matched->next;
                memIRescind(hm->entry_alloc, matched);
                return;
            }
        }
    }
}
/******************************************************************************************
* Intrusive Circular Doubly Linked List
* Inspired from github.com/torvalds/linux/blob/master/include/linux/list.h
*/
typedef struct List_Node {
struct List_Node *next, *prev;
} List_Node;
/* To get the node (container struct) holding the linked list node */
#define listThisNode(nodeptr, type, member) containerof((nodeptr), type, member)
#define listNextNode(nodeptr, type, member) containerof((nodeptr)->next, type, member)
#define listPrevNode(nodeptr, type, member) containerof((nodeptr)->prev, type, member)
/*
* Initialize the list like this:
* *****************************
* typedef struct N {
* List_Node l;
* } N;
*
* N n;
* n.l = listInit(n.l);
*/
// A node is an empty list when it points at itself in both directions.
#define listInit(name) (List_Node) {&(name), &(name)}
#define listReset(ptr) do{(ptr)->next = (ptr); (ptr)->prev = (ptr);}while(0)
// Stitch newnode in between two adjacent nodes prev and next.
// (All four writes hit distinct fields, so the order is immaterial — this
// also works when prev == next, i.e. inserting into a one-node list.)
header_function
void list_Add (List_Node *newnode, List_Node *prev, List_Node *next)
{
    newnode->prev = prev;
    newnode->next = next;
    prev->next = newnode;
    next->prev = newnode;
}
// Insert newnode immediately after the given node.
header_function
void listAddAfter (List_Node *newnode, List_Node *after_this)
{
    list_Add(newnode, after_this, after_this->next);
}
// Insert newnode immediately before the given node.
header_function
void listAddBefore (List_Node *newnode, List_Node *before_this)
{
    list_Add(newnode, before_this->prev, before_this);
}
// Unlink whatever lies between prev and next by joining them directly.
header_function
void list_RemoveNodeBetween (List_Node * prev, List_Node * next)
{
    prev->next = next;
    next->prev = prev;
}
// Unlink entry and null its links so stale reuse faults immediately.
header_function
void listRemove (List_Node *entry)
{
list_RemoveNodeBetween(entry->prev, entry->next);
entry->next = nullptr;
entry->prev = nullptr;
}
// Unlink entry, then make it a valid (self-linked) empty list of its own.
header_function
void listRemoveAndInit (List_Node *entry)
{
list_RemoveNodeBetween(entry->prev, entry->next);
listReset(entry);
}
// Substitute newnode into old's position; old's own links are left stale.
// NOTE: the statement order is deliberate — when old is self-linked (an
// empty list head) the aliased writes leave newnode correctly self-linked.
header_function
void listReplace(List_Node *old, List_Node *newnode)
{
newnode->next = old->next;
newnode->next->prev = newnode;
newnode->prev = old->prev;
newnode->prev->next = newnode;
}
// Replace old, then reset old into a valid empty list.
header_function
void listReplaceAndInit(List_Node *old, List_Node *newnode)
{
listReplace(old, newnode);
listReset(old);
}
// Exchange the positions of two nodes (handles the adjacent-node case).
header_function
void listSwap(List_Node *entry1, List_Node *entry2)
{
List_Node *pos = entry2->prev;
listRemove(entry2);
listReplace(entry1, entry2);
if (pos == entry1) pos = entry2; // the two were adjacent
listAddAfter(entry1, pos);
}
// Detach `list` from wherever it is and re-insert it after `after_this`.
header_function
void listMoveAfter (List_Node *list, List_Node *after_this)
{
list_RemoveNodeBetween(list->prev, list->next);
listAddAfter(list, after_this);
}
// Detach `list` from wherever it is and re-insert it before `before_this`.
header_function
void listMoveBefore (List_Node *list, List_Node *before_this)
{
list_RemoveNodeBetween(list->prev, list->next);
listAddBefore(list, before_this);
}
// Empty list == a head that is its own successor.
header_function
Bool listIsEmpty (List_Node *node)
{
Bool result = (node->next == node);
return result;
}
// Splice in a List `list`, between the Nodes `node` and `node->next`
// (unchecked core; `list` must be non-empty, and `list` itself — the head —
// is NOT linked in, only the nodes between list->next and list->prev).
header_function
void list_Splice (List_Node *list, List_Node *node)
{
List_Node *first = list->next;
List_Node *last = list->prev;
List_Node *at = node->next;
first->prev = node;
node->next = first;
last->next = at;
at->prev = last;
}
// Splice in a List `list`, between the Nodes `node` and `node->next`
header_function
void listSplice (List_Node *list, List_Node *node)
{
if (!listIsEmpty(list)) list_Splice(list, node);
}
// Splice, then reset `list`'s head so it is a valid empty list again.
header_function
void listSpliceInit (List_Node *list, List_Node *node)
{
if (!listIsEmpty(list)) {
list_Splice(list, node);
listReset(list);
}
}
// NOTE(naman): The list head should be a placeholder terminal node with no associated metadata. Not doing this makes
// safe iteration very difficult and/or tedious. Trust me, I've wasted too much time trying various schemes out.
// Removal-safe iteration: the successor is cached in _idx_2_ before the body
// runs, so the current node may be unlinked/freed.
# define LIST_FOR_EACH(_idx, _head) \
for (List_Node \
* _idx = (_head)->next, \
* _idx##_2_ = (_head)->next->next; \
_idx != (_head); \
_idx = _idx##_2_, _idx##_2_ = _idx##_2_ -> next)
# define LIST_FOR_EACH_REVERSE(_idx, _head) \
for (List_Node \
* _idx = (_head)->prev, \
* _idx##_2_ = (_head)->prev->prev; \
_idx != (_head); \
_idx = _idx##_2_, _idx##_2_ = _idx##_2_ -> prev)
/******************************************************************************************
* Vinyas (.viny)
* Text Serialization Format
*
* Viny* vinyCreate (Memory_Allocator_Interface miface)
* void vinyDelete (Viny *v)
* Bool vinyParseTxt (Viny *v, Txt t)
* void vinyPrint (Viny *v, PRINT_FUNC((*print_func)), void *print_userdata)
*
*/
// A parsed table: hash -> entry btree, plus an insertion-ordered entry list.
typedef struct Viny_Table {
BTree btree;
List_Node sequential;
} Viny_Table;
// One key/value pair; the value is either a nested table or a string span.
typedef struct Viny_Entry {
List_Node bucket_list;
List_Node sequential;
Txt_Span key;
struct {
Viny_Table *table;
Txt_Span string;
} value;
} Viny_Entry;
// Parser/document state. Parse-time allocations are routed through the push
// arena `mp` (via push_miface), so teardown frees them wholesale.
typedef struct Viny {
Memory_Allocator_Interface system_miface; // allocator the Viny itself came from
Memory_Allocator_Interface push_miface; // arena-backed shim for internal allocations
Memory_Push mp;
Viny_Table global_table;
struct {
Char *msg;
Uint32 line, column;
} err;
Viny_Entry global_entry;
} Viny;
// Allocator-interface shim: bump-allocate from the Viny push arena.
header_function
MEMORY_ALLOCATOR_REQUEST_FUNC(vinyMemoryAllocate)
{
Memory_Push *mp = cast_val(Memory_Push*, userdata);
void *ptr = memPushAllot(mp, amount);
return ptr;
}
// Rescind is a deliberate no-op: push-arena memory is reclaimed wholesale.
header_function
MEMORY_ALLOCATOR_RESCIND_FUNC(vinyMemoryDeallocate)
{
Memory_Push *mp = cast_val(Memory_Push*, userdata);
unused_variable(mp);
unused_variable(ptr);
}
// Allocate and wire up a Viny document with a 1 MiB push arena.
// NOTE(review): fields not set here (err, global_entry.key, ...) are
// assumed to come back zeroed from memIRequest — confirm the contract.
header_function
Viny* vinyCreate (Memory_Allocator_Interface miface)
{
Viny *v = cast_val(Viny*, memIRequest(miface, sizeof(*v)));
v->system_miface = miface;
memPushCreate(&v->mp, miface, MiB(1));
v->push_miface = memICreate(&v->mp, vinyMemoryAllocate, vinyMemoryDeallocate);
btreeCreate(&v->global_table.btree, v->push_miface);
v->global_entry.value.table = &v->global_table;
return v;
}
// Tear down: btree nodes live in the push arena, so deleting the arena
// reclaims them; the Viny itself goes back to the system allocator.
header_function
void vinyDelete (Viny *v)
{
btreeDelete(&v->global_table.btree);
memPushDelete(&v->mp);
Memory_Allocator_Interface miface = v->system_miface;
memIRescind(miface, v);
}
// Token kinds; EOF is 0 so a zero-initialized token reads as end-of-input.
typedef enum Viny_Token_Kind_ {
Viny_Token_Kind_EOF_ = 0,
Viny_Token_Kind_STRING_,
Viny_Token_Kind_COLON_,
Viny_Token_Kind_BRACE_OPEN_,
Viny_Token_Kind_BRACE_CLOSE_,
} Viny_Token_Kind_;
typedef struct Viny_Token_ {
Txt_Span ts; // the token's text (written for STRING tokens)
Viny_Token_Kind_ kind;
Byte _pad[4];
} Viny_Token_;
// Tokenizer state; line/column are maintained for error reporting.
typedef struct Viny_Tokenizer_ {
Txt t;
Viny_Token_ token;
Sint64 cursor;
Uint32 line, column;
Label err_msg;
} Viny_Tokenizer_;
// Record an error message; always returns false so callers can write
// `return viny_TokerError(...)` directly.
header_function
Bool viny_TokerError (Viny_Tokenizer_ *toker, Label msg)
{
toker->err_msg = msg;
return false;
}
// Advance the cursor by up to `count` characters, stopping at the NUL
// terminator, while keeping line/column in sync (column resets to 1 after a
// newline).
// NOTE(review): uses uint32_t/size_t rather than the project's Uint32/Size
// — functionally the same here, but inconsistent with the file's style.
header_function
void viny_TokenizerAdvance (Viny_Tokenizer_ *toker, uint32_t count)
{
for (size_t i = 0; (i < count) && txtStr(toker->t)[toker->cursor]; i++) {
if (txtStr(toker->t)[toker->cursor] == '\n') {
toker->line++;
toker->column = 1;
} else {
toker->column++;
}
toker->cursor++;
}
}
// Skip whitespace and comments, looping until nothing more is consumed.
// Two comment forms: `#{ ... #}` block comments (nestable) and `#` line
// comments running to the next newline (a backslash before the newline
// continues the comment onto the following line).
header_function
void viny_TokenizerEatWhitespace (Viny_Tokenizer_ *toker)
{
Bool ate_something_this_loop = true;
while (ate_something_this_loop) {
ate_something_this_loop = false;
// Plain whitespace.
while ((txtStr(toker->t)[toker->cursor] == ' ') ||
(txtStr(toker->t)[toker->cursor] == '\t') ||
(txtStr(toker->t)[toker->cursor] == '\r') ||
(txtStr(toker->t)[toker->cursor] == '\n')) {
ate_something_this_loop = true;
viny_TokenizerAdvance(toker, 1);
}
if (txtStr(toker->t)[toker->cursor] == '#') {
viny_TokenizerAdvance(toker, 1);
if (txtStr(toker->t)[toker->cursor] == '{') {
// Block comment: scan for the matching #}, tracking nesting depth.
ate_something_this_loop = true;
viny_TokenizerAdvance(toker, 2);
Uint comment_nesting = 1;
while (txtStr(toker->t)[toker->cursor]) {
if (txtStr(toker->t)[toker->cursor] == '#') {
viny_TokenizerAdvance(toker, 1);
if (txtStr(toker->t)[toker->cursor] == '{') {
comment_nesting++;
viny_TokenizerAdvance(toker, 2);
} else if (txtStr(toker->t)[toker->cursor] == '}') {
comment_nesting--;
viny_TokenizerAdvance(toker, 2);
}
}
if (comment_nesting == 0) break;
viny_TokenizerAdvance(toker, 1);
}
} else {
// Line comment: eat to end of line; backslash-newline continues it.
ate_something_this_loop = true;
while (txtStr(toker->t)[toker->cursor]) {
if (txtStr(toker->t)[toker->cursor] == '\n') {
if ((toker->cursor) > 0 && txtStr(toker->t)[toker->cursor-1] == '\\') {
// do nothing
} else {
break;
}
}
viny_TokenizerAdvance(toker, 1);
}
viny_TokenizerAdvance(toker, 1); // consume the terminating newline
}
}
}
}
header_function
Txt_Span viny_TokenizeString (Viny_Tokenizer_ *toker, Char ending_quote)
{
    // Consume a string token and return its span.
    //
    // If `ending_quote` is non-zero, characters are consumed up to (but not
    // including) that quote; otherwise the token is a bare word that ends at
    // whitespace or a structural character (':', '{', '}'). In both modes a
    // backslash escapes the character that follows it.
    Sint64 span_begin = toker->cursor;
    for (;;) {
        Char c = txtStr(toker->t)[toker->cursor];
        if (!c) break;
        if (ending_quote) {
            if (c == ending_quote) break;
        } else {
            if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r') ||
                (c == ':') || (c == '{') || (c == '}')) break;
        }
        viny_TokenizerAdvance(toker, 1);
        // If the character just consumed was a backslash, also consume the
        // escaped character so it cannot terminate the token.
        if (txtStr(toker->t)[toker->cursor-1] == '\\') {
            viny_TokenizerAdvance(toker, 1);
        }
    }
    return txtGetSpan(toker->t, span_begin, toker->cursor);
}
header_function
Bool viny_Tokenize (Viny_Tokenizer_ *toker)
{
    // Produce the next token into toker->token. Returns true on success and
    // false on a malformed token (err_msg is filled in via viny_TokerError).
    viny_TokenizerEatWhitespace(toker);
    Viny_Token_Kind_ result;
    switch (txtStr(toker->t)[toker->cursor]) {
        case '\0': result = Viny_Token_Kind_EOF_; break;
        case ':': viny_TokenizerAdvance(toker, 1); result = Viny_Token_Kind_COLON_; break;
        case '{': viny_TokenizerAdvance(toker, 1); result = Viny_Token_Kind_BRACE_OPEN_; break;
        case '}': viny_TokenizerAdvance(toker, 1); result = Viny_Token_Kind_BRACE_CLOSE_; break;
        case '"': case '\'': {
            Char quote = txtStr(toker->t)[toker->cursor];
            viny_TokenizerAdvance(toker, 1);
            toker->token.ts = viny_TokenizeString(toker, quote);
            // BUG FIX: this previously returned the Bool from viny_TokerError
            // cast to Viny_Token_Kind_ even though this function returns Bool.
            // Return the failure Bool directly; token.kind is left untouched
            // on this error path, as before.
            if (txtStr(toker->t)[toker->cursor] != quote) return viny_TokerError(toker, labelMake("No closing quote"));
            result = Viny_Token_Kind_STRING_;
            viny_TokenizerAdvance(toker, 1); // step past the closing quote
        } break;
        default: {
            // Unquoted word: delimited by whitespace or ':', '{', '}'.
            toker->token.ts = viny_TokenizeString(toker, 0);
            result = Viny_Token_Kind_STRING_;
        } break;
    }
    toker->token.kind = result;
    return true;
}
header_function
Viny_Tokenizer_ viny_TokenizerCreate (Txt t)
{
    // Build a tokenizer over `t` and prime its one-token lookahead.
    // If the priming tokenize fails, the error is captured in err_msg.
    Viny_Tokenizer_ toker = {.t = t, .line = 1, .column = 1};
    viny_Tokenize(&toker);
    return toker;
}
header_function
Bool viny_TokenMatch (Viny_Tokenizer_ *toker, Viny_Token_Kind_ kind)
{
    // True iff the current lookahead token has the given kind (non-consuming).
    return toker->token.kind == kind;
}
header_function
Bool viny_TokenConfirm (Viny_Tokenizer_ *toker, Viny_Token_Kind_ kind)
{
    // If the lookahead matches `kind`, consume it (advancing to the next
    // token) and report success; otherwise leave the tokenizer untouched.
    if (!viny_TokenMatch(toker, kind)) {
        return false;
    }
    viny_Tokenize(toker);
    return true;
}
header_function Bool viny_ParseTable (Viny *v, Viny_Tokenizer_ *toker, Viny_Table *tbl);
header_function
Bool viny_ParseTable (Viny *v, Viny_Tokenizer_ *toker, Viny_Table *tbl)
{
    // Parse a run of `key`, `key : string` or `key : { ... }` entries into
    // `tbl` until the lookahead is no longer a STRING token. Entries are kept
    // both in insertion order (tbl->sequential list) and hashed by key
    // (tbl->btree maps key hash -> bucket list of entries). Returns false
    // (with toker->err_msg set) on malformed input.
    listReset(&tbl->sequential);
    while (viny_TokenMatch(toker, Viny_Token_Kind_STRING_)) {
        Viny_Entry *entry = cast_val(Viny_Entry*, memPushAllot(&v->mp, sizeof(*entry)));
        memzero(entry);
        listAddBefore(&entry->sequential, &tbl->sequential);
        listReset(&entry->bucket_list);
        entry->key = toker->token.ts;
        viny_Tokenize(toker);
        if (viny_TokenConfirm(toker, Viny_Token_Kind_COLON_)) {
            if (viny_TokenConfirm(toker, Viny_Token_Kind_BRACE_OPEN_)) {
                // Nested table value: recurse, then require the closing brace.
                Viny_Table *valtbl = cast_val(Viny_Table*, memPushAllot(&v->mp, sizeof(*valtbl)));
                btreeCreate(&valtbl->btree, v->push_miface);
                if (!viny_ParseTable(v, toker, valtbl)) return false;
                entry->value.table = valtbl;
                if (!viny_TokenConfirm(toker, Viny_Token_Kind_BRACE_CLOSE_)) return viny_TokerError(toker, labelMake("Closing brace expected when parsing table"));
            } else if (viny_TokenMatch(toker, Viny_Token_Kind_STRING_)) {
                // String value.
                entry->value.string = toker->token.ts;
                viny_Tokenize(toker);
            } else {
                return viny_TokerError(toker, labelMake("No valid token found while parsing table"));
            }
        } else {
            // Bare key with no ':' — the entry carries an empty value.
            memzero(&entry->value);
        }
        // TODO(naman): First check if the key being inserted has already not been added.
        // If it has, replace it instead of adding a new one.
        // One btree node per key hash, each holding a list of entries whose
        // keys share that hash (collisions are resolved by the memcmp in
        // viny_QueryRecursive).
        List_Node *list_node = cast_val(List_Node*, btreeSearch(&tbl->btree, entry->key.hash));
        if (list_node == nullptr) {
            list_node = cast_val(List_Node*, memPushAllot(&v->mp, sizeof(*list_node)));
            listReset(list_node);
            btreeInsert(&tbl->btree, entry->key.hash, list_node);
        }
        listAddBefore(&entry->bucket_list, list_node);
    }
    return true;
}
header_function
Bool vinyParseTxt (Viny *v, Txt t)
{
    // Parse `t` into v->global_table. Succeeds only if parsing worked AND the
    // whole input was consumed (the remaining lookahead is EOF).
    Viny_Tokenizer_ toker = viny_TokenizerCreate(t);
    if (!viny_ParseTable(v, &toker, &v->global_table)) {
        return false;
    }
    return viny_TokenMatch(&toker, Viny_Token_Kind_EOF_);
}
// State for vinyPrint: the user-supplied printf-style callback (invoked with
// print_userdata as its first argument) plus the current indentation depth.
typedef struct Viny_Printer_ {
    PRINT_FUNC((*print_func));
    void *print_userdata;
    Uint indent;                // Current nesting depth during recursive printing
    Byte _pad[4];               // Explicit padding
} Viny_Printer_;
header_function
void viny_PrintRecursive (Viny_Printer_ *vpr, Viny_Table *tbl)
{
    // Print `tbl` in source form — "key": { ... }, "key": "string", or a bare
    // "key" — indenting by the current depth and recursing into nested tables.
    Uint depth = cast_val(Uint, vpr->indent);
    vpr->indent++;
    LIST_FOR_EACH(node, &tbl->sequential) {
        Viny_Entry *entry = cast_val(Viny_Entry *, containerof(node, Viny_Entry, sequential));
        // Every entry line starts at the current depth.
        for (Uint i = 0; i < depth; i++) vpr->print_func(vpr->print_userdata, " ");
        if (entry->value.table) {
            vpr->print_func(vpr->print_userdata, "\"%.*s\": {\n",
                            cast_val(Sint, txtspanLen(entry->key)), txtspanStr(entry->key));
            viny_PrintRecursive(vpr, entry->value.table);
            for (Uint i = 0; i < depth; i++) vpr->print_func(vpr->print_userdata, " ");
            vpr->print_func(vpr->print_userdata, "}\n");
        } else if (txtspanLen(entry->value.string)) {
            vpr->print_func(vpr->print_userdata, "\"%.*s\": \"%.*s\"\n",
                            cast_val(Sint, txtspanLen(entry->key)), txtspanStr(entry->key),
                            cast_val(Sint, txtspanLen(entry->value.string)), txtspanStr(entry->value.string));
        } else {
            vpr->print_func(vpr->print_userdata, "\"%.*s\"\n",
                            cast_val(Sint, txtspanLen(entry->key)), txtspanStr(entry->key));
        }
    }
    vpr->indent--;
}
header_function
void vinyPrint (Viny *v, PRINT_FUNC((*print_func)), void *print_userdata)
{
    // Dump the whole document through the callback, starting at depth zero.
    Viny_Printer_ printer = {.print_func = print_func, .print_userdata = print_userdata};
    viny_PrintRecursive(&printer, &v->global_table);
}
header_function
Viny_Entry* viny_QueryRecursive (Viny_Entry *entry, va_list args)
{
    // Walk a path of C-string keys (terminated by a nullptr vararg) down
    // nested tables, starting at `entry`. Returns the entry reached by the
    // full path, or nullptr if any component is missing or the path tries to
    // descend past a non-table value.
    // NOTE(review): `args` is consumed both by the recursive call and by a
    // later va_arg in this frame; this relies on platform-specific va_list
    // behavior — va_copy per level would be strictly portable. Confirm the
    // supported targets before changing.
    if (entry == nullptr) return nullptr;
    Char *arg = va_arg(args, Char*);
    if (arg == nullptr) return entry; // End of path: this entry is the result.
    Size argsize = strlen(arg);
    Uint64 arghash = hash64(arg, argsize, 0);
    // NOTE(review): assumes entry->value.table is non-null here. That holds
    // for the recursion (guarded below) but not if a string-valued entry is
    // passed in at the top with path components remaining — verify callers.
    List_Node *list = cast_val(List_Node*, btreeSearch(&entry->value.table->btree, arghash));
    if (list == nullptr) return nullptr;
    // Scan the hash bucket for an exact key match (resolves hash collisions).
    LIST_FOR_EACH(l, list) {
        Viny_Entry *e = cast_val(Viny_Entry *, containerof(l, Viny_Entry, bucket_list));
        if ((argsize == txtspanLen(e->key)) &&
            (memcmp(txtspanStr(e->key), arg, argsize) == 0)) {
            if (e->value.table) {
                return viny_QueryRecursive(e, args); // Descend into the nested table.
            } else {
                Char *nextarg = va_arg(args, Char*);
                if (nextarg) {
                    return nullptr; // Path continues, but the value is not a table.
                } else {
                    return e;
                }
            }
        }
    }
    return nullptr;
}
header_function
Viny_Entry* viny_QueryEntry (Viny_Entry *ve, ...)
{
    // Variadic front-end for viny_QueryRecursive. The argument list must be
    // nullptr-terminated; the vinyQuery*/vinyQueryEntry* macros below append
    // that terminator automatically.
    if (ve == nullptr) return nullptr;
    va_list args;
    va_start(args, ve);
    Viny_Entry *result = viny_QueryRecursive(ve, args);
    va_end(args);
    return result;
}
// Convenience wrappers that append the terminating nullptr to the key path.
#define vinyQuery(_v, ...) viny_QueryEntry(&(_v)->global_entry, __VA_ARGS__, nullptr)
#define vinyQueryEntry(_ve, ...) viny_QueryEntry((_ve), __VA_ARGS__, nullptr)
header_function
Viny_Entry* viny_EntryHasTable (Viny_Entry *ve)
{
    // Pass `ve` through iff it is non-null and holds a table value.
    if ((ve != nullptr) && (ve->value.table != nullptr)) {
        return ve;
    }
    return nullptr;
}
header_function
Viny_Entry* viny_EntryHasString (Viny_Entry *ve)
{
    // Pass `ve` through iff it is non-null and holds a non-empty string value.
    if ((ve != nullptr) && txtspanLen(ve->value.string)) {
        return ve;
    }
    return nullptr;
}
header_function
Viny_Entry* viny_EntryHasEmpty (Viny_Entry *ve)
{
    // Pass `ve` through iff it is non-null and holds neither a table nor a
    // (non-empty) string value.
    if (ve == nullptr) return nullptr;
    Bool has_value = viny_EntryHasTable(ve) || viny_EntryHasString(ve);
    return has_value ? nullptr : ve;
}
// Typed query helpers: like vinyQuery/vinyQueryEntry, but additionally filter
// the resulting entry by the kind of value it holds (table, string, or empty),
// returning nullptr when the kind does not match.
#define vinyQueryForTable(_v, ...) viny_EntryHasTable (vinyQuery(_v, __VA_ARGS__))
#define vinyQueryForString(_v, ...) viny_EntryHasString(vinyQuery(_v, __VA_ARGS__))
#define vinyQueryForEmpty(_v, ...) viny_EntryHasEmpty (vinyQuery(_v, __VA_ARGS__))
#define vinyQueryEntryForTable(_ve, ...) viny_EntryHasTable (vinyQueryEntry(_ve, __VA_ARGS__))
#define vinyQueryEntryForString(_ve, ...) viny_EntryHasString(vinyQueryEntry(_ve, __VA_ARGS__))
#define vinyQueryEntryForEmpty(_ve, ...) viny_EntryHasEmpty (vinyQueryEntry(_ve, __VA_ARGS__))
/******************************************************************************************
* C++ Data Structures
*****************************************************************************************/
#if defined(ENV_LANG_CXX)
# if defined(ENV_LANF_CPP) && (ENV_LANF_CPP < 2011)
// Intended as a pre-C++11 substitute for std::addressof: returns the true
// address of `arg` even when T overloads unary operator&.
// NOTE(review): "ENV_LANF_CPP" is not defined anywhere visible (likely a typo
// for ENV_LANG_CPP — compare ENV_LANG_CXX above), so this fallback is
// currently dead code; also, it uses `noexcept`, which would not compile on
// the pre-C++11 compilers the guard appears to target. Confirm the intended
// macro and language level before enabling.
template<class T>
T* addressof(T& arg) noexcept
{
    return reinterpret_cast<T*>(&const_cast<Byte&>(reinterpret_cast<const volatile Byte&>(arg)));
}
# endif
// Dynamic array. Growth uses memcpy and Clear() uses memset, so T must be
// (effectively) trivially copyable and must not hold self-referential state.
// NOTE(review): the copy constructor is a shallow copy — two Vectors then
// share one buffer and BOTH destructors will rescind it; confirm it is only
// used for deliberate ownership hand-off.
template<typename T>
struct Vector {
    Memory_Allocator_Interface miface;
    Size cap = 0; // NOTE(naman): Maximum number of elements that can be stored
    Size len = 0; // NOTE(naman): Count of elements actually stored
    T *buf = nullptr;
    // Shared constructor body: start empty with the given allocator.
    void init_ (Memory_Allocator_Interface MIface) {
        cap = 0;
        len = 0;
        buf = nullptr;
        miface = MIface;
    }
    Vector(Memory_Allocator_Interface MIface) {
        init_(MIface);
    }
    Vector(Memory_Allocator_Interface MIface, Size elems) {
        init_(MIface);
        GrowToCap(elems);
    }
    Vector(Vector &t) {
        cap = t.cap;
        len = t.len;
        buf = t.buf;
        miface = t.miface;
    }
    ~Vector () {
        // Every slot in [0, cap) was default-constructed by GrowToCap, so all
        // cap slots (not just len) must be destroyed.
        for (Size i = 0; i < cap; i++) {
            (addressof(buf[i]))->~T();
        }
        memIRescind(miface, buf);
        cap = 0;
        len = 0;
    }
    T& operator[] (Size index) const {
        return buf[index];
    }
    void AddInPlace_ (Size index, T elem) {
        buf[index] = elem;
    }
    Bool IsFull_ () {
        return len + 1 > cap;
    }
    void Grow () {
        GrowToCap(2 * cap);
    }
    void Add (T elem) {
        if (unlikely(IsFull_())) Grow();
        buf[len] = elem;
        len++;
    }
    // O(1) removal that does not preserve element order.
    void RemoveUnsorted (Size index) {
        buf[index] = buf[len - 1];
        len--;
    }
    // Zero the live elements and reset the count; capacity is retained.
    void Clear () {
        memset(buf, 0, len * sizeof(T));
        len = 0;
    }
    void Resize (Size new_count) {
        if (new_count > cap) {
            GrowToCap(new_count);
        }
    }
    Size SizeOf () { return len * sizeof(T); }
    Size ElemIn () { return len; }
    Size MaxSizeOf () { return cap * sizeof(T); }
    // BUG FIX: previously returned cap * sizeof(T) (a byte count duplicating
    // MaxSizeOf); the element-count counterpart of ElemIn is simply cap.
    Size MaxElemIn () { return cap; }
    Bool IsEmpty () { return len == 0; }
    Bool IsNotEmpty () { return !IsEmpty(); }
    Bool IsNull () { return buf == nullptr; }
    T* PtrOnePastLast () {
        return buf + len;
    }
    T* PtrLast () {
        return buf + (len - 1);
    }
    T* PtrFirst () {
        return buf;
    }
    // Reallocate to new_size bytes, moving existing contents with memcpy.
    void GrowToSize_ (Size new_size) {
        if (IsNull()) {
            buf = static_cast<T*>(memIRequest(miface, new_size));
        } else {
            T *new_buf = static_cast<T*>(memIRequest(miface, new_size));
            memcpy(new_buf, buf, cap * sizeof(T));
            memIRescind(miface, buf);
            buf = new_buf;
        }
    }
    // Ensure capacity for at least elem_count elements (minimum 16),
    // default-constructing the newly added slots.
    void GrowToCap (Size elem_count) {
        Size new_cap = maximum(elem_count, 16ULL);
        GrowToSize_(new_cap * sizeof(T));
        for (Size i = cap; i < new_cap; i++) {
            new (addressof(buf[i])) T();
        }
        cap = new_cap;
    }
    void IncreaseCapBy (Size elem_count) {
        // GrowToCap already updates cap to exactly this value; the previous
        // trailing `cap = new_cap;` assignment was redundant and was removed.
        GrowToCap(maximum(elem_count + len, 16ULL));
    }
};
// LIFO adapter over Vector<T>.
template<typename T>
struct Stack {
    Vector<T> data;
    Stack (Memory_Allocator_Interface MIface) : data{MIface} {}
    Stack (Memory_Allocator_Interface MIface, Size elems) : data{MIface, elems} {}
    // Place `elem` on top of the stack.
    void Push (T elem) {
        data.Add(elem);
    }
    // Return (without removing) the top element.
    T Peek () { // NOTE(naman): Check if stack is empty before calling
        return *data.PtrLast();
    }
    // Return and remove the top element.
    T Pop () { // NOTE(naman): Check if stack is empty before calling
        T top = Peek();
        data.len--;
        return top;
    }
    Bool IsEmpty () {
        return data.IsEmpty();
    }
    Bool IsNotEmpty () {
        return !IsEmpty();
    }
    Size ElemIn () {
        return data.ElemIn();
    }
};
// TODO(naman): Convert this from two-stack approach to some better (more performant) one.
// Right now, the amortized time is constant, but it's not real-time.
// Double-ended queue built from two stacks: the top of `head` is the queue's
// front, the top of `tail` is its back. When one side runs dry, the other
// stack is flipped onto it IN FULL so that the far end surfaces on top.
template<typename T>
struct Queue { // Double Ended
    Stack<T> head, tail;
    Queue (Memory_Allocator_Interface MIface) : head{MIface}, tail{MIface} {}
    Queue (Memory_Allocator_Interface MIface, Size elems) : head{MIface, elems}, tail{MIface, elems} {}
    void PushFront (T elem) {
        head.Push(elem);
    }
    T PeekFront () { // Check if queue is empty before calling
        if (head.IsEmpty()) {
            debugAssert(tail.IsEmpty() == false);
            // BUG FIX: only half of `tail` used to be transferred, which left
            // a MIDDLE element on top of `head` — the true front sits at the
            // bottom of `tail`, so the entire stack must be flipped to keep
            // FIFO order.
            Size iter = tail.ElemIn();
            for (Size i = 0; i < iter; i++) {
                head.Push(tail.Pop());
            }
        }
        return head.Peek();
    }
    T PopFront () { // Check if queue is empty before calling
        PeekFront();
        return head.Pop();
    }
    void PushBack (T elem) {
        tail.Push(elem);
    }
    T PeekBack () { // Check if queue is empty before calling
        if (tail.IsEmpty()) {
            debugAssert(head.IsEmpty() == false);
            // BUG FIX: same as PeekFront — flip all of `head`. (This also
            // removes the inconsistent max(1, ...) vs max(1ULL, ...) calls.)
            Size iter = head.ElemIn();
            for (Size i = 0; i < iter; i++) {
                tail.Push(head.Pop());
            }
        }
        return tail.Peek();
    }
    T PopBack () { // Check if queue is empty before calling
        PeekBack();
        return tail.Pop();
    }
    Bool IsEmpty () {
        return head.IsEmpty() && tail.IsEmpty();
    }
    Bool IsNotEmpty () {
        return !IsEmpty();
    }
    Size ElemIn () {
        return head.ElemIn() + tail.ElemIn();
    }
};
#endif
#define STD_H_INCLUDE_GUARD
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment