B(2, 4)
/** | |
* @file b24.c | |
* | |
* Compiling with MSVC 19.41 or later: | |
* | |
* cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 b24.c /Fe: b24.exe /MD | |
*/ | |
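/*
 * A roughly equivalent non-MSVC build (an illustrative guess, not taken
 * from the original notes) would look something like:
 *
 *   cc -std=c2x -O2 -march=x86-64-v3 b24.c -o b24
 *
 * where -march=x86-64-v3 (or -mavx2 -mbmi2) supplies the AVX2/BMI2
 * intrinsics used by seq.h on x86-64 and -std=c2x enables _BitInt.
 */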
#include <errno.h> | |
#include <inttypes.h> | |
#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include "neuron.h" | |
#include "popcnt.h" | |
#include "vec.h" | |
#if defined(__INTELLISENSE__) || defined(_MSC_VER) | |
# define BITINT_C(n) n | |
# define _BitInt(n) int | |
#else | |
# define BITINT_C(n) n##WB | |
#endif | |
#ifdef _MSC_VER | |
# pragma intrinsic(_BitScanForward) | |
static const_inline int __builtin_ctzl(unsigned long x) { | |
unsigned long y; | |
return _BitScanForward(&y, x) ? (int)y : (int)sizeof(x) * CHAR_BIT; | |
} | |
#endif | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4710)) | |
pragma_msvc(warning(disable: 4711)) | |
static const_inline char | |
hexdig (unsigned _BitInt(4) n) | |
{ | |
return (const char[16]){ | |
'0','1','2','3','4','5','6','7', | |
'8','9','a','b','c','d','e','f', | |
}[n]; | |
} | |
static inline char * | |
decstr_u5 (char *dst, | |
unsigned _BitInt(5) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(10U)) { | |
*dst++ = (char)((unsigned char)'0' + src / BITINT_C(10U)); | |
src %= BITINT_C(10U); | |
} else if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
#if 0 | |
static force_inline char * | |
decstr_s5 (char *dst, | |
unsigned _BitInt(5) src, | |
bool pad) | |
{ | |
const typeof(src) lt0 = src >> 4; | |
src = (src - lt0) ^ ((BITINT_C(31U) ^ BITINT_C(31U)) - lt0); | |
if (lt0) { | |
if (src >= BITINT_C(10U)) { | |
*dst++ = '-'; | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else { | |
if (pad) | |
*dst++ = ' '; | |
*dst++ = '-'; | |
} | |
} else { | |
if (src >= BITINT_C(10U)) { | |
if (pad) | |
*dst++ = ' '; | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else if (pad) { | |
*dst++ = ' '; | |
*dst++ = ' '; | |
} | |
} | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
#endif | |
static force_inline char * | |
decstr_u4 (char *dst, | |
unsigned _BitInt(4) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(10U)) { | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
static force_inline char * | |
decstr_s4 (char *dst, | |
unsigned _BitInt(4) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(8U)) { | |
*dst++ = '-'; | |
*dst++ = (char)((unsigned char)'8' + | |
(unsigned char)8 - src); | |
} else { | |
if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
} | |
return dst; | |
} | |
static const uint16_t b24[] = { | |
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU, | |
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U, | |
0xcbdU, 0xd2fU, 0xd79U, 0xde5U, | |
0xf2dU, 0xf4bU, 0xf59U, 0xf65U, | |
}; | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4200)) | |
pragma_msvc(warning(disable: 4820)) | |
struct b24_cfg { | |
uint32_t have_opt; | |
uint16_t rotation; | |
uint16_t offset; | |
uint64_t neuron; | |
size_t n_seq; | |
uint16_t seq[]; | |
}; | |
pragma_msvc(warning(pop)) | |
typedef typeof((struct b24_cfg){0}.have_opt) b24_cfg_flags; | |
typedef typeof((struct b24_cfg){0}.offset) b24_cfg_offset; | |
struct trace { | |
uint64_t cyc; | |
uint32_t seq; | |
uint16_t len; | |
uint16_t map; | |
}; | |
const_function | |
static struct trace | |
b24_trace_init (struct b24_cfg const *const cfg, | |
unsigned i) | |
{ | |
uint32_t seq = cfg->seq[i] | ((uint32_t)cfg->seq[i] << 16U); | |
return (struct trace){ | |
.cyc = 0U, | |
.seq = cfg->rotation ? (seq >> (16 - cfg->rotation)) | |
| (seq << cfg->rotation) : seq, | |
.len = 0U, | |
.map = 0U, | |
}; | |
} | |
const_function | |
static struct trace | |
b24_trace_from (struct trace tc, | |
b24_cfg_offset off) | |
{ | |
tc.cyc = off; | |
tc.len = 1U; | |
tc.map = 1U << off; | |
for (;;) { | |
off = (tc.seq >> off) & (b24_cfg_offset)15; | |
typeof(tc.map) bit = (typeof(bit))1 << off; | |
if (tc.map & bit) | |
break; | |
tc.map |= bit; | |
tc.cyc |= (typeof(tc.cyc))off | |
<< (tc.len++ << 2U); | |
} | |
return tc; | |
} | |
const_function | |
static struct trace | |
b24_trace_next (struct trace tc) | |
{ | |
if (tc.map == (typeof(tc.map))0xffff) | |
return (struct trace){0}; | |
b24_cfg_offset off = 0U; | |
uint_fast16_t map = tc.map; | |
while (map & (typeof(map))1) { | |
map >>= 1U; | |
++off; | |
} | |
return b24_trace_from(tc, off); | |
} | |
enum b24_opt { | |
/* The std_in option ('-') is 0 to guarantee all | |
* argument-accepting options stored in `expect` | |
* evaluate to true during argument parsing. | |
*/ | |
b24_opt_std_in , // - (unused for now) | |
b24_opt_big_hex , // -b | |
b24_opt_cycles , // -c | |
b24_opt_debruijn, // -d | |
b24_opt_entries , // -e | |
b24_opt_graphviz, // -g | |
b24_opt_help , // -h | |
b24_opt_json , // -j | |
b24_opt_neuron , // -n | |
b24_opt_one_path, // -o | |
b24_opt_python , // -p | |
b24_opt_expand , // -q | |
b24_opt_rotation, // -r | |
b24_opt_sig_path, // -s | |
b24_opt_no_space, // -w | |
b24_opt_hex_path, // -x | |
b24_opt_no_align, // -y | |
b24_opt_no_zeros, // -z | |
// qualifier flags | |
ONLY_ONCE = 0x40U, | |
NEEDS_ARG = 0x80U, | |
}; | |
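/* Map an option's enum constant to its command-line character at compile
 * time: the compound literal `(int[1U+b24_opt_##x]){0}` has an array type
 * whose length encodes the enum value, so _Generic can dispatch on the
 * resulting pointer-to-array type.
 */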
#define b24_opt_char(x) \ | |
_Generic(&(int[1U+b24_opt_##x]){0} \ | |
, int(*)[1U+b24_opt_std_in ]: 0 \ | |
, int(*)[1U+b24_opt_big_hex ]: 'b' \ | |
, int(*)[1U+b24_opt_cycles ]: 'c' \ | |
, int(*)[1U+b24_opt_debruijn]: 'd' \ | |
, int(*)[1U+b24_opt_entries ]: 'e' \ | |
, int(*)[1U+b24_opt_graphviz]: 'g' \ | |
, int(*)[1U+b24_opt_help ]: 'h' \ | |
, int(*)[1U+b24_opt_json ]: 'j' \ | |
, int(*)[1U+b24_opt_neuron ]: 'n' \ | |
, int(*)[1U+b24_opt_one_path]: 'o' \ | |
, int(*)[1U+b24_opt_python ]: 'p' \ | |
, int(*)[1U+b24_opt_expand ]: 'q' \ | |
, int(*)[1U+b24_opt_rotation]: 'r' \ | |
, int(*)[1U+b24_opt_sig_path]: 's' \ | |
, int(*)[1U+b24_opt_no_space]: 'w' \ | |
, int(*)[1U+b24_opt_hex_path]: 'x' \ | |
, int(*)[1U+b24_opt_no_align]: 'y' \ | |
, int(*)[1U+b24_opt_no_zeros]: 'z' ) | |
static const char b24_opt_to_char[] = { | |
#define b24_option(x) [b24_opt_##x] = b24_opt_char(x) | |
b24_option(std_in ), | |
b24_option(big_hex ), | |
b24_option(cycles ), | |
b24_option(debruijn), | |
b24_option(entries ), | |
b24_option(graphviz), | |
b24_option(help ), | |
b24_option(json ), | |
b24_option(neuron ), | |
b24_option(one_path), | |
b24_option(python ), | |
b24_option(expand ), | |
b24_option(rotation), | |
b24_option(sig_path), | |
b24_option(no_space), | |
b24_option(hex_path), | |
b24_option(no_align), | |
b24_option(no_zeros), | |
#undef b24_option | |
}; | |
static const unsigned char | |
b24_char_to_opt[1U << CHAR_BIT] = { | |
#define b24_option(x) [b24_opt_char(x)] = b24_opt_##x | |
b24_option(std_in )| ONLY_ONCE, | |
b24_option(big_hex )| ONLY_ONCE, | |
b24_option(cycles )| ONLY_ONCE, | |
b24_option(debruijn)| NEEDS_ARG, | |
b24_option(entries )| ONLY_ONCE, | |
b24_option(graphviz)| ONLY_ONCE, | |
b24_option(help )| ONLY_ONCE, | |
b24_option(json )| ONLY_ONCE, | |
b24_option(neuron )| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(one_path)| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(python )| ONLY_ONCE, | |
b24_option(expand )| ONLY_ONCE, | |
b24_option(rotation)| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(sig_path)| ONLY_ONCE, | |
b24_option(no_space)| ONLY_ONCE, | |
b24_option(hex_path)| ONLY_ONCE, | |
b24_option(no_align)| ONLY_ONCE, | |
b24_option(no_zeros)| ONLY_ONCE, | |
#undef b24_option | |
}; | |
/** | |
* @brief Check if option `opt` is present. | |
*/ | |
static pure_inline bool | |
b24_has_option (struct b24_cfg const *const cfg, | |
const enum b24_opt opt) | |
{ | |
return cfg->have_opt & ((b24_cfg_flags)1 << opt); | |
} | |
/** | |
* @brief Mark option `opt` as present. | |
*/ | |
static force_inline void | |
b24_option_add (struct b24_cfg *const cfg, | |
const enum b24_opt opt) | |
{ | |
cfg->have_opt |= (b24_cfg_flags)1 << opt; | |
} | |
struct b24_syntax { | |
char bra; | |
char ket; | |
char sep; | |
char spc; | |
}; | |
static force_inline char * | |
hexstr_u4 (char *dst, | |
unsigned _BitInt(4) src) | |
{ | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
*dst++ = hexdig(src); | |
return dst; | |
} | |
static inline char * | |
hexstr_u16 (char *dst, | |
uint16_t src, | |
bool pad) | |
{ | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
if (pad) { | |
*dst++ = hexdig(src >> 12U ); | |
*dst++ = hexdig(src >> 8U & 15U); | |
*dst++ = hexdig(src >> 4U & 15U); | |
} else if (src > UINT16_C(0x000f)) { | |
if (src > UINT16_C(0x00ff)) { | |
if (src > UINT16_C(0x0fff)) | |
*dst++ = hexdig(src >> 12U); | |
*dst++ = hexdig(src >> 8U & 15U); | |
} | |
*dst++ = hexdig(src >> 4U & 15U); | |
} | |
*dst++ = hexdig(src & 15U); | |
return dst; | |
} | |
static force_inline char * | |
hexstr_16x4 (char *dst, | |
uint64_t src, | |
unsigned len, | |
struct b24_syntax syn) | |
{ | |
if (len) { | |
for (;; src >>= 4) { | |
dst = hexstr_u4(dst, src & 15U); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
} | |
} | |
return dst; | |
} | |
static char * | |
hexstr_u64 (char *dst, | |
uint64_t src, | |
bool pad) | |
{ | |
char buf[16] = "000000000000000"; | |
char *p = &buf[15]; | |
for (;; --p) { | |
*p = hexdig(src & 15U); | |
src >>= 4U; | |
if (!src) | |
break; | |
} | |
if (pad) | |
p = &buf[0]; | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
for (;; ++p) { | |
*dst++ = *p; | |
if (p == &buf[15]) | |
break; | |
} | |
return dst; | |
} | |
static force_inline char * | |
decstr_16x4 (char *dst, | |
uint64_t src, | |
unsigned len, | |
struct b24_syntax syn, | |
bool aln, | |
bool sig) | |
{ | |
if (len) { | |
bool w = syn.spc; | |
if (sig) { | |
for (;; src >>= 4) { | |
dst = decstr_s4(dst, src & 15U, aln); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst += w; | |
} | |
} else { | |
for (;; src >>= 4) { | |
dst = decstr_u4(dst, src & 15U, aln); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst += w; | |
} | |
} | |
} | |
return dst; | |
} | |
static char * | |
b24_cfg_trace_print (struct b24_cfg const *cfg, | |
char *dst, | |
struct trace src, | |
unsigned idx, | |
struct b24_syntax syn) | |
{ | |
bool align = !b24_has_option(cfg, b24_opt_no_align); | |
bool zeros = !b24_has_option(cfg, b24_opt_no_zeros); | |
bool expand = b24_has_option(cfg, b24_opt_expand); | |
bool all = !b24_has_option(cfg, b24_opt_one_path) && | |
!expand; | |
if (all) { | |
*dst++ = syn.bra; | |
dst = hexstr_u16(dst, (uint16_t)src.seq, zeros); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = decstr_u5(dst, idx, align); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = hexstr_u16(dst, src.map, zeros); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = decstr_u5(dst, src.len, align); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
} | |
if (b24_has_option(cfg, b24_opt_big_hex)) { | |
dst = hexstr_u64(dst, src.cyc, zeros); | |
} else { | |
*dst++ = syn.bra; | |
dst = b24_has_option(cfg, b24_opt_hex_path) | |
? hexstr_16x4(dst, src.cyc, src.len, syn) | |
: decstr_16x4(dst, src.cyc, src.len, syn, align, | |
b24_has_option(cfg, b24_opt_sig_path)); | |
*dst++ = syn.ket; | |
} | |
if (all) | |
*dst++ = syn.ket; | |
*dst = '\0'; | |
return dst; | |
} | |
static struct trace | |
b24_expand (struct trace tc) | |
{ | |
tc.cyc = seq_expand((uint16_t)tc.seq); | |
tc.len = 16U; | |
tc.map = 0xffffU; | |
return tc; | |
} | |
static void | |
b24_cfg_trace (struct b24_cfg const *cfg) | |
{ | |
struct b24_syntax syn = (struct b24_syntax[]){ | |
{'{', '}', ',', ' '}, | |
{'[', ']', ',', ' '}, // json or python | |
{'{', '}', ',', 0x0}, // no space | |
{'[', ']', ',', 0x0}, // js/py, no space | |
}[b24_has_option(cfg, b24_opt_json) | | |
b24_has_option(cfg, b24_opt_python) | | |
(b24_has_option(cfg, b24_opt_no_space) << 1U)]; | |
bool expand = b24_has_option(cfg, b24_opt_expand); | |
bool once = b24_has_option(cfg, b24_opt_one_path); | |
bool all = !once && !expand; | |
bool please = false; | |
for (unsigned i = 0; i < cfg->n_seq;) { | |
struct trace tc = b24_trace_init(cfg, i); | |
unsigned cycles = 0; | |
for (;; ++cycles) { | |
struct trace t; | |
if (once) { | |
t = b24_trace_from(tc, cfg->offset); | |
} else if (expand) { | |
t = b24_expand(tc); | |
} else { | |
t = b24_trace_next(tc); | |
if (!t.map) | |
break; | |
tc.cyc |= t.cyc << (tc.len << 2U); | |
tc.len += t.len; | |
tc.map |= t.map; | |
} | |
char buf[160]; | |
char *p = b24_cfg_trace_print(cfg, buf, t, | |
cycles, syn); | |
if (p) { | |
if (please) { | |
(void)putchar(syn.sep); | |
(void)putchar('\n'); | |
} | |
(void)fputs(buf, stdout); | |
please = true; | |
} | |
if (!all) | |
break; | |
} | |
if (++i == cfg->n_seq && please) | |
(void)putchar('\n'); | |
} | |
} | |
static void | |
b24_help (void) | |
{ | |
(void)printf( | |
"Usage: b24 [OPTIONS]... [<0..65535>]...\n" | |
"Options:\n" | |
" -%c Print this help text and exit\n" | |
" -%c <state> Trace a neuron's state evolution\n" | |
" -%c <0..15> Trace a single path at offset\n" | |
" -%c Expand subsequences in place\n" | |
"\n" | |
"Input options:\n" | |
" -%c <0..15> Add distinct B(2, 4) as input\n" | |
" -%c <0..15> Apply rotation before tracing\n" | |
"\n" | |
"Output options:\n" | |
" -%c Only cycles, no entry paths\n" | |
" -%c Always find all entry paths\n" | |
"\n" | |
"Formatting options:\n" | |
" -%c Output in Graphviz format\n" | |
" -%c Output in JSON format\n" | |
" -%c Output in Python format\n" | |
" -%c Signed decimal path steps\n" | |
" -%c Hexadecimal path steps\n" | |
" -%c Print every sequence as one big hex number\n" | |
" -%c Don't align output columns\n" | |
" -%c Don't zero-pad hexadecimals\n" | |
" -%c Omit non-separator whitespace\n", | |
b24_opt_char(help), b24_opt_char(neuron), | |
b24_opt_char(one_path), b24_opt_char(expand), | |
b24_opt_char(debruijn), b24_opt_char(rotation), | |
b24_opt_char(cycles), b24_opt_char(entries), | |
b24_opt_char(graphviz), b24_opt_char(json), | |
b24_opt_char(python), b24_opt_char(sig_path), | |
b24_opt_char(hex_path), b24_opt_char(big_hex), | |
b24_opt_char(no_align), b24_opt_char(no_zeros), | |
b24_opt_char(no_space) | |
); | |
} | |
struct str7 { | |
union { | |
char d[8U]; | |
uint64_t u; | |
}; | |
}; | |
const_inline | |
static struct str7 | |
b24_opt_char_str (char ch) | |
{ | |
struct str7 str = {{{'"', '-', ch}}}; | |
str.d[3U - !ch] = '"'; | |
return str; | |
} | |
static void | |
b24_cfg_destroy (struct b24_cfg **cfg) | |
{ | |
if (cfg && *cfg) { | |
free(*cfg); | |
*cfg = NULL; | |
} | |
} | |
_Noreturn static void | |
b24_helpful_exit (struct b24_cfg **cfg, | |
int ret) | |
{ | |
b24_cfg_destroy(cfg); | |
b24_help(); | |
exit(ret); | |
} | |
static struct b24_cfg * | |
b24_cfg_create (size_t n_seq) | |
{ | |
struct b24_cfg *cfg = calloc(1, offsetof(struct b24_cfg, seq) | |
+ n_seq * sizeof cfg->seq[0]); | |
if (!cfg) | |
perror("calloc"); | |
return cfg; | |
} | |
_Noreturn static void | |
b24_cosmic_ray (struct b24_cfg **cfg, | |
char const *what, | |
char const *func, | |
int line) | |
{ | |
size_t n = what ? strlen(what) : 0; | |
if (!n) | |
what = ""; | |
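	/* Index into the string literal below: skip the leading ".\n"
	 * entirely when `what` is empty, skip just the '.' when `what`
	 * already ends in one, otherwise append ".\n" and the rest. */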
(void)fprintf(stderr, "[\033[31mERROR\033[m] %s:%d: %s%s\n", | |
func, line, what, &".\nFailure caused by the impact " | |
"of a high-energy cosmic particle or a high-density " | |
"program author."[n ? what[n-1U] == '.' : 2U]); | |
b24_cfg_destroy(cfg); | |
exit(EXIT_FAILURE); | |
} | |
static int | |
b24_options_incompatible (struct b24_cfg *cfg, | |
enum b24_opt opt, | |
b24_cfg_flags bad, | |
const char sep) | |
{ | |
if (!b24_has_option(cfg, opt)) | |
return 0; | |
bad &= cfg->have_opt; | |
if (!bad) | |
return 0; | |
cfg->have_opt &= ~((typeof(cfg->have_opt))1 << opt); | |
int n = popcnt(bad); | |
if (n >= (int)array_size(b24_opt_to_char)) { | |
b24_cosmic_ray(&cfg, "Option flag outside expected range", | |
__func__, __LINE__); | |
} | |
char buf[64] = ""; | |
char *dst = &buf[0]; | |
b24_cfg_flags bit = 1U; | |
for (int prev = 0, i = 0; i < n;) { | |
int o = __builtin_ctzl(bad); | |
if (o >= (int)array_size(b24_opt_to_char)) | |
break; | |
bit <<= o - prev; | |
bad &= ~bit; | |
prev = o; | |
if (++i > 1) { | |
if (n > 2) | |
*dst++ = sep; | |
*dst++ = ' '; | |
if (i == n) { | |
*dst++ = 'o'; | |
*dst++ = 'r'; | |
*dst++ = ' '; | |
} | |
} | |
struct str7 s = b24_opt_char_str(b24_opt_to_char[o]); | |
for (char *src = s.d; *src; ++src) { *dst++ = *src; } | |
} | |
*dst = '\0'; | |
(void)fprintf(stderr, "Option %s cannot be combined with %s\n", | |
b24_opt_char_str(b24_opt_to_char[opt]).d, buf); | |
return 1; | |
} | |
static void | |
b24_option_assert_once (struct b24_cfg *cfg, | |
enum b24_opt opt) | |
{ | |
if (b24_has_option(cfg, opt)) { | |
(void)fprintf(stderr, "Option %s specified twice\n", | |
b24_opt_char_str(b24_opt_to_char[opt]).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
} | |
static inline uint64_t | |
b24_get_u64_arg (char *arg, | |
int *err) | |
{ | |
if (!arg) { | |
*err = EFAULT; | |
return 0; | |
} | |
if (!*arg) { | |
*err = EINVAL; | |
return 0; | |
} | |
errno = 0; | |
uint64_t u64 = 0U; | |
char *endptr = arg; | |
int64_t i64 = _Generic(i64, long: strtol, | |
long long: strtoll)(arg, &endptr, 0); | |
int e = errno; | |
if (!e) { | |
u64 = (uint64_t)i64; | |
if (*endptr) | |
e = EINVAL; | |
} else if (e == ERANGE && i64 == _Generic(i64, long: LONG_MAX, | |
long long: LLONG_MAX)) { | |
errno = 0; | |
endptr = arg; | |
u64 = _Generic(u64, unsigned long: strtoul, | |
unsigned long long: strtoull)(arg, &endptr, 0); | |
e = errno; | |
if (!e && *endptr) | |
e = EINVAL; | |
} | |
*err = e; | |
return u64; | |
} | |
static uint64_t | |
b24_get_int_arg (char *arg, | |
int *err, | |
uint64_t max) | |
{ | |
int e = 0; | |
uint64_t u64 = b24_get_u64_arg(arg, &e); | |
if (u64 > max) { | |
u64 = max; | |
if (!e) | |
e = ERANGE; | |
} | |
*err = e; | |
return u64; | |
} | |
static struct b24_cfg * | |
b24_parse_args (int argc, | |
char **argv) | |
{ | |
struct b24_cfg *cfg = b24_cfg_create(argc > 1 ? argc - 1 : 0); | |
if (!cfg) | |
return NULL; | |
enum b24_opt expect = 0; | |
char c_ = '\0'; | |
for (int i = 1; i < argc; i++) { | |
char *a = argv[i]; | |
if (expect) { | |
if (!*a) | |
goto missing_arg; | |
parse_arg: do{}while(0); | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4061)) | |
uint64_t max = 15U; | |
switch (expect) { | |
case b24_opt_neuron: | |
max = UINT64_MAX; | |
break; | |
case b24_opt_debruijn: | |
case b24_opt_one_path: | |
case b24_opt_rotation: | |
break; | |
default: | |
b24_cosmic_ray(&cfg, "", __func__, __LINE__); | |
} | |
int e = 0; | |
uint64_t n = b24_get_int_arg(a, &e, max); | |
if (e) { | |
(void)fprintf(stderr, "Option %s " | |
"parse error: %s\n", | |
b24_opt_char_str(c_).d, | |
strerror(e)); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
switch (expect) { | |
case b24_opt_debruijn: | |
cfg->seq[cfg->n_seq++] = b24[n]; | |
break; | |
case b24_opt_neuron: | |
cfg->neuron = n; | |
break; | |
case b24_opt_one_path: | |
cfg->offset = (b24_cfg_offset)n; | |
break; | |
case b24_opt_rotation: | |
cfg->rotation = (uint16_t)n; | |
default: | |
break; | |
} | |
pragma_msvc(warning(pop)) | |
b24_option_add(cfg, expect); | |
expect = 0; | |
continue; | |
} | |
if (*a == '-') { | |
c_ = *++a; | |
concatenated_option: | |
unsigned ch_opt = b24_char_to_opt[(unsigned char)c_]; | |
if (!ch_opt) { | |
(void)fprintf(stderr, "Option %s is unknown" | |
"\n", b24_opt_char_str(c_).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
enum b24_opt opt = ch_opt & ~(NEEDS_ARG | ONLY_ONCE); | |
if (ch_opt & ONLY_ONCE) | |
b24_option_assert_once(cfg, opt); | |
if (ch_opt & NEEDS_ARG) { | |
expect = opt; | |
if (*++a) | |
goto parse_arg; | |
} else { | |
b24_option_add(cfg, opt); | |
if (c_ && (c_ = *++a)) | |
goto concatenated_option; | |
} | |
continue; | |
} | |
int e = 0; | |
uint64_t n = b24_get_int_arg(a, &e, UINT16_MAX); | |
if (e) { | |
(void)fprintf(stderr, "Bad sequence: %s\n", | |
strerror(e)); | |
b24_cfg_destroy(&cfg); | |
return NULL; | |
} | |
cfg->seq[cfg->n_seq++] = (uint16_t)n; | |
} | |
int yikes = b24_options_incompatible(cfg, b24_opt_big_hex , | |
1U << b24_opt_hex_path | | |
1U << b24_opt_sig_path , ',') | |
+ b24_options_incompatible(cfg, b24_opt_cycles , | |
1U << b24_opt_entries | | |
1U << b24_opt_neuron | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_entries , | |
1U << b24_opt_cycles | | |
1U << b24_opt_neuron | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_graphviz , | |
1U << b24_opt_json | | |
1U << b24_opt_python , ',') | |
+ b24_options_incompatible(cfg, b24_opt_json , | |
1U << b24_opt_python | | |
1U << b24_opt_graphviz , ',') | |
+ b24_options_incompatible(cfg, b24_opt_neuron , | |
1U << b24_opt_one_path | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_one_path , | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_python , | |
1U << b24_opt_json | | |
1U << b24_opt_graphviz , ',') | |
+ b24_options_incompatible(cfg, b24_opt_expand , | |
1U << b24_opt_cycles | | |
1U << b24_opt_entries | | |
1U << b24_opt_one_path , ',') | |
+ b24_options_incompatible(cfg, b24_opt_sig_path , | |
1U << b24_opt_hex_path , ','); | |
if (yikes) | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
if (expect) | |
goto missing_arg; | |
if (b24_has_option(cfg, b24_opt_help)) | |
b24_helpful_exit(&cfg, EXIT_SUCCESS); | |
if (!cfg->n_seq) { | |
(void)fputs("No sequence(s) specified\n", stderr); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
return cfg; | |
missing_arg: | |
(void)fprintf(stderr, "Option %s expects an argument\n", | |
b24_opt_char_str(c_).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
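/* Emit one Graphviz record node for a 16-bit sequence, e.g.
 * `S3 [label="<f>1|<e>0|...|<0>1"];`, with a named port per bit.
 * (No caller in this file yet.)
 */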
static char * | |
b24_render_gv_struct (char *dst, | |
size_t siz, | |
uint16_t seq, | |
unsigned _BitInt(4) id) | |
{ | |
if (siz < | |
sizeof "S0 [label=\"<f>0|<e>0|<d>0|<c>0|" | |
"<b>0|<a>0|<9>0|<8>0|" | |
"<7>0|<6>0|<5>0|<4>0|" | |
"<3>0|<2>0|<1>0|<0>0\"];\n") | |
return NULL; | |
memcpy(dst, "S0 [label=\"", sizeof "S0 [label=\"" - 1U); | |
*++dst = hexdig(id); | |
dst += sizeof " [label=\""; | |
for (unsigned off = 15U;; --off) { | |
*dst++ = '<'; | |
*dst++ = hexdig(off); | |
*dst++ = '>'; | |
*dst++ = (char)((unsigned char)'0' + (seq >> off & 1U)); | |
if (!off) | |
break; | |
*dst++ = '|'; | |
} | |
*dst++ = '"'; | |
*dst++ = ']'; | |
*dst++ = ';'; | |
*dst++ = '\n'; | |
return dst; | |
} | |
int | |
main (int c, | |
char **v) | |
{ | |
struct b24_cfg *cfg = b24_parse_args(c, v); | |
if (!cfg) | |
return EXIT_FAILURE; | |
b24_cfg_trace(cfg); | |
b24_cfg_destroy(&cfg); | |
return EXIT_SUCCESS; | |
} | |
pragma_msvc(warning(pop)) |
/** @file clock.h | |
*/ | |
#ifndef B24_CLOCK_H_ | |
#define B24_CLOCK_H_ | |
#ifndef _MSC_VER | |
# include <time.h> | |
#endif | |
#include "vec.h" | |
static force_inline vec128 | |
unix_epoch (void) | |
{ | |
#ifdef _MSC_VER | |
	FILETIME ft = {0};
	GetSystemTimeAsFileTime(&ft);
	uint64_t sec = ((ULARGE_INTEGER){
		.LowPart = ft.dwLowDateTime,
		.HighPart = ft.dwHighDateTime,
	}).QuadPart / UINT64_C(10000000) -
	((369U * 365U + 89U) * UINT64_C(86400));
	/* The return type is vec128, so the scalar seconds value cannot be
	 * returned directly; mirror it into both 64-bit lanes (the POSIX
	 * branch below fills the lanes with clock readings instead). */
	return (vec128){ .u64 = { sec, sec } };
#else | |
struct timespec ts = {0}, ts2 = {0}; | |
(void)clock_gettime(CLOCK_REALTIME, &ts); | |
(void)clock_gettime(CLOCK_MONOTONIC, &ts2); | |
vec128 epoch = { | |
.u32[0] = ts.tv_sec, | |
.u32[2] = ts2.tv_sec, | |
.u32[1] = ts2.tv_sec, | |
.u32[3] = ts.tv_sec | |
}; | |
return epoch; | |
#endif | |
} | |
static force_inline void hol_up (uint64_t ns) | |
{ | |
#ifdef _MSC_VER | |
LARGE_INTEGER t = { | |
.QuadPart = (int64_t)(ns / 100U) * -1 | |
}; | |
(void)NtDelayExecution(FALSE, &t); | |
#else | |
struct timespec t = { | |
.tv_sec = (typeof(t.tv_sec))(ns / UINT64_C(1000000000)), | |
.tv_nsec = (typeof(t.tv_nsec))(ns % UINT64_C(1000000000)) | |
}; | |
(void)clock_nanosleep(CLOCK_MONOTONIC, 0, &t, NULL); | |
#endif | |
} | |
#endif /* B24_CLOCK_H_ */ |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include "cortex.h" | |
#include "seq.h" | |
struct cortex * | |
cortex_create (void) | |
{ | |
struct cortex *ctx = calloc(1, sizeof *ctx); | |
if (!ctx) { | |
perror("calloc"); | |
return NULL; | |
} | |
cortex_init(ctx); | |
return ctx; | |
} | |
void | |
cortex_destroy (struct cortex **ctx) | |
{ | |
if (ctx) { | |
struct cortex *c = *ctx; | |
*ctx = NULL; | |
ctx = NULL; | |
free(c); | |
c = NULL; | |
} | |
} | |
void | |
cortex_init (struct cortex *ctx) | |
{ | |
const uint16_t de_bruijn_seq_2_4[] = { | |
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU, | |
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U, | |
0xcbdU, 0xd2fU, 0xd79U, 0xde5U, | |
0xf2dU, 0xf4bU, 0xf59U, 0xf65U, | |
}; | |
for (uint32_t i = 0, y = 0; y < 16U; ++y) { | |
uint32_t seq = de_bruijn_seq_2_4[y]; | |
seq |= seq << 16U; | |
for (uint32_t x = i + 16U; i < x; ++i) { | |
ctx->paths[fwd(32, 8, i)] = | |
seq_expand_vec128((uint16_t)seq); | |
seq >>= 1U; | |
} | |
} | |
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) { | |
layer_init(&ctx->layer[i], (struct layer_cfg){ | |
.self = i, | |
.prev = (i + array_size(ctx->layer) | |
- 1U) % array_size(ctx->layer), | |
.next = (i + 1U) % array_size(ctx->layer), | |
.mode = i, | |
}, &ctx->data[i], &ctx->data[i + 1U]); | |
} | |
} |
/** @file cortex.h | |
*/ | |
#ifndef B24_CORTEX_H_ | |
#define B24_CORTEX_H_ | |
#include "clock.h" | |
#include "layer.h" | |
#include "neuron.h" | |
#ifdef __aarch64__ | |
# include "zp7.h" | |
#endif | |
enum ctx_flag { | |
CORTEX_TOCK = 1U | |
}; | |
enum { DIR_IN, DIR_OUT }; | |
/** | |
* @brief Cortex structure. | |
* | |
* This is the structure that holds all the neurons. | |
* | |
* More like a resonant bucket than a brain. | |
*/ | |
struct cortex { | |
uint64_t resvd[7]; // align to cache line | |
uint64_t flags; | |
vec128 paths[256]; | |
struct layer layer[16]; | |
struct chunk data[17]; | |
}; | |
union ctx_addr { | |
struct { | |
uint8_t b8 : 3; | |
uint8_t u8 : 4; | |
uint8_t vec : 1; | |
uint8_t : 8; | |
}; | |
struct { | |
uint8_t b64 : 6; | |
uint8_t u64 : 1; | |
uint8_t : 1; | |
uint8_t rot : 4; | |
uint8_t seq : 4; | |
}; | |
struct { | |
uint8_t : 3; | |
uint8_t chn : 4; | |
uint8_t dir : 1; | |
uint8_t nrn : 8; | |
}; | |
uint16_t raw; | |
}; | |
_Static_assert(sizeof(union ctx_addr) == 2U, ""); | |
static pure_inline union ctx_addr | |
ctx_addr (uint16_t raw) | |
{ | |
return (union ctx_addr){.raw = raw}; | |
} | |
extern struct cortex * | |
cortex_create (void); | |
extern void | |
cortex_destroy (struct cortex **ctx); | |
extern void | |
cortex_init (struct cortex *ctx); | |
struct cortex_loc { | |
uint16_t bit : 2; | |
uint16_t chn : 4; | |
uint16_t rot : 4; | |
uint16_t seq : 4; | |
uint16_t pad : 2; | |
}; | |
struct linear_loc { | |
uint16_t col : 7; | |
uint16_t row : 7; | |
uint16_t pad : 2; | |
}; | |
static const_inline struct cortex_loc | |
linear_loc_fwd (struct linear_loc loc) | |
{ | |
union { | |
struct cortex_loc ctx; | |
struct linear_loc lin; | |
uint16_t raw; | |
} src = { | |
.lin = loc | |
}, dst = { | |
.raw = (uint16_t)fwd(32, 14, src.raw) | |
}; | |
return dst.ctx; | |
} | |
static force_inline void | |
cortex_tick (struct cortex *ctx) | |
{ | |
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) { | |
layer_tick(&ctx->layer[i], &ctx->data[(i + 1U) & 15U]); | |
layer_tock(&ctx->layer[i], &ctx->data[i ]); | |
} | |
} | |
#endif /* B24_CORTEX_H_ */ |
#include <stdio.h> | |
#include "layer_priv.h" | |
void | |
layer_init (struct layer *obj, | |
struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out) | |
{ | |
*obj = layer(cfg, in, out); | |
} | |
void | |
layer_fini (struct layer *obj) | |
{ | |
(void)obj; | |
} | |
void | |
layer_run (struct layer *obj, | |
struct chunk *input) | |
{ | |
bool init_now = !obj->cfg.init; | |
unsigned hole = 0; | |
for (uint16_t i = 0; i < (uint16_t)array_size(obj->data[0]->v); ++i) { | |
// internal connections | |
vec128 self_vec = *layer_vector(obj, (union ctx_addr){ | |
.dir = DIR_IN, | |
.nrn = i, | |
}); | |
vec128 peer_vec[16] = {0}; | |
union ctx_addr peer_addr[16] = {0}; | |
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) { | |
uint16_t tmp = layer_map(obj, (union ctx_addr){ | |
.chn = j, | |
.nrn = i, | |
}); | |
peer_addr[j] = (union ctx_addr){ | |
.chn = tmp & 15U, | |
.dir = DIR_IN, | |
.nrn = tmp >> 4U, | |
}; | |
peer_vec[j] = *layer_vector(obj, peer_addr[j]); | |
} | |
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) { | |
if (peer_addr[j].nrn == i) { | |
if (init_now) | |
++hole; | |
// self connection (external input) | |
// TOEDOE: run callback and assign result to self.u8[j] | |
//self_vec.u8[j] = input->v[i].u8[j]; | |
continue; | |
} | |
uint8_t ch = self_vec.u8[j]; | |
uint8_t up = layer_channel_whence( | |
obj, | |
(union ctx_addr) { | |
.chn = j, | |
.nrn = i, | |
} | |
); | |
bool match_lsb = (ch & 15U) == (up & 15U); | |
bool match_msb = (ch >> 4U) == j; | |
if ((match_msb ^ match_lsb)) { | |
peer_vec[j].u8[j] = self_vec.u8[j]; | |
self_vec.u8[j] = 0; | |
} else if (match_msb) { | |
self_vec.u8[j] = 0; | |
} else { | |
self_vec.u8[j] = peer_vec[j].u8[j]; | |
//peer_vec[j].u8[j] = 0; | |
} | |
layer_save_vector(obj, (union ctx_addr){ | |
.dir = DIR_OUT, | |
.nrn = peer_addr[j].nrn, | |
}, peer_vec[j]); | |
} | |
struct neuron nrn = layer_make_neuron( | |
obj, | |
(union ctx_addr){ | |
.dir = DIR_IN, | |
.nrn = i, | |
} | |
); | |
layer_save_vector( | |
obj, | |
(union ctx_addr) { | |
.dir = DIR_OUT, | |
.nrn = i, | |
}, | |
neuron_send(&nrn, self_vec) | |
); | |
} | |
if (init_now) { | |
obj->cfg.init = 1; | |
obj->cfg.hole = hole; | |
(void)printf("inputs: %u\n", hole); | |
} | |
} |
/** @file layer.h | |
*/ | |
#ifndef B24_LAYER_H_ | |
#define B24_LAYER_H_ | |
#include "vec.h" | |
struct layer_cfg { | |
uint64_t self : 4; | |
uint64_t prev : 4; | |
uint64_t next : 4; | |
uint64_t mode : 4; | |
uint64_t hole : 8; | |
uint64_t init : 1; | |
uint64_t tock : 1; | |
uint64_t resv : 36; | |
}; | |
_Static_assert(sizeof(struct layer_cfg) | |
== sizeof(uint64_t), ""); | |
/** | |
* @brief Network layer. | |
*/ | |
struct layer { | |
struct layer_cfg cfg; | |
struct chunk *data[2]; | |
}; | |
_Static_assert(sizeof(struct layer) | |
== sizeof(struct layer_cfg) | |
+ sizeof(struct chunk *) * 2U, ""); | |
extern void | |
layer_init (struct layer *obj, | |
struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out); | |
extern void | |
layer_fini (struct layer *obj); | |
extern void | |
layer_run (struct layer *obj, | |
struct chunk *in); | |
static force_inline void | |
layer_tick (struct layer *obj, | |
struct chunk *in) | |
{ | |
obj->cfg.tock = 0; | |
layer_run(obj, in); | |
} | |
static force_inline void | |
layer_tock (struct layer *obj, | |
struct chunk *in) | |
{ | |
obj->cfg.tock = 1; | |
layer_run(obj, in); | |
} | |
#endif /* B24_LAYER_H_ */ |
/** @file layer_priv.h | |
*/ | |
#ifndef B24_LAYER_PRIV_H_ | |
#define B24_LAYER_PRIV_H_ | |
#include <stdio.h> | |
#include "cortex.h" | |
#include "layer.h" | |
static pure_inline struct cortex * | |
to_cortex (struct layer const *obj) | |
{ | |
return container_of(obj, struct cortex, layer[obj->cfg.self]); | |
} | |
static pure_inline struct vec128 * | |
layer_vector (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return &obj->data[addr.dir ^ obj->cfg.tock]->v[addr.nrn]; | |
} | |
static pure_inline struct neuron | |
layer_make_neuron (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return neuron_from_vec128( | |
&to_cortex(obj)->paths[addr.nrn], | |
layer_vector(obj, addr) | |
); | |
} | |
static pure_inline uint8_t | |
layer_channel_whence (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return to_cortex(obj)->paths[addr.nrn].u8[addr.chn]; | |
} | |
static pure_inline uint8_t | |
layer_channel_get (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return layer_vector(obj, addr)->u8[addr.chn]; | |
} | |
static force_inline void | |
layer_channel_set (struct layer *obj, | |
union ctx_addr addr, | |
uint8_t data) | |
{ | |
layer_vector(obj, addr)->u8[addr.chn] = data; | |
} | |
static pure_inline uint8_t | |
layer_channel_swap (struct layer *obj, | |
union ctx_addr addr, | |
uint8_t data) | |
{ | |
uint8_t ret = layer_channel_get(obj, addr); | |
addr.dir ^= 1U; | |
layer_channel_set(obj, addr, data); | |
return ret; | |
} | |
static force_inline struct layer | |
layer (struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out) | |
{ | |
return (struct layer) { | |
.cfg = cfg, | |
.data = {in, out} | |
}; | |
} | |
static force_inline void | |
layer_save_vector (struct layer *obj, | |
union ctx_addr addr, | |
vec128 data) | |
{ | |
*layer_vector(obj, addr) = data; | |
} | |
static pure_inline uint16_t | |
layer_map (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
struct cortex *ctx = to_cortex(obj); | |
const uint16_t index[16] = { | |
// cortex::paths is shuffled with rev(32, 8), | |
// the original indices are the EOL comments. | |
83, 36, 100, 99, 110, 129, 213, 200, // 29, 66, 74, 89, 122, 129, 143, 168, | |
204, 160, 224, 240, 167, 249, 253, 170, // 170, 192, 200, 204, 211, 237, 239, 240, | |
}; | |
uint16_t s = index[addr.chn]; | |
uint16_t r = index[obj->cfg.mode]; | |
uint16_t them = (uint16_t)ctx->paths[s].u8[addr.seq] << 8U | |
| (uint16_t)ctx->paths[r].u8[addr.rot] << 4U | |
| (uint16_t)ctx->paths[r].u8[addr.chn]; | |
return (uint16_t)rev(32, 12, them); | |
} | |
#endif /* B24_LAYER_PRIV_H_ */ |
/** | |
* @file neuron.h | |
*/ | |
#ifndef B24_NEURON_H_ | |
#define B24_NEURON_H_ | |
#include <limits.h> | |
#include "seq.h" | |
#include "vec.h" | |
/** | |
* @brief Runtime neuron structure. | |
* | |
* This is what is actually used with the SIMD instructions. | |
*/ | |
struct neuron { | |
u8x16 paths; | |
u8x16 state; | |
}; | |
static pure_inline struct neuron | |
neuron_from_vec128 (vec128 *paths, | |
vec128 *state) | |
{ | |
return (struct neuron){ | |
.paths = u8x16_from_vec128(*paths), | |
.state = u8x16_from_vec128(*state) | |
}; | |
} | |
static pure_inline struct neuron | |
neuron (uint16_t seq, | |
vec128 state) | |
{ | |
return (struct neuron){ | |
.paths = seq_expand_u8x16(seq), | |
.state = u8x16_from_vec128(state) | |
}; | |
} | |
static force_inline u8x16 | |
neuron_tick (struct neuron *nrn, | |
u8x16 in) | |
{ | |
u8x16 state = nrn->state; | |
nrn->state = age_and_mutate( | |
merge_input( | |
state, | |
in | |
), | |
nrn->paths | |
); | |
return state; | |
} | |
static force_inline vec128 | |
neuron_tock (struct neuron *nrn, | |
vec128 in) | |
{ | |
nrn->state = age_and_mutate( | |
merge_input( | |
nrn->state, | |
u8x16_from_vec128(in) | |
), | |
nrn->paths | |
); | |
return vec128_from_u8x16(nrn->state); | |
} | |
static force_inline vec128 | |
neuron_send (struct neuron *nrn, | |
vec128 in) | |
{ | |
nrn->state = mutate( | |
select_nonzero_input( | |
nrn->state, | |
u8x16_from_vec128(in) | |
), | |
nrn->paths | |
); | |
return vec128_from_u8x16(nrn->state); | |
} | |
#endif /* B24_NEURON_H_ */ |
/** @file popcnt.h | |
*/ | |
#ifndef B24_POPCNT_H_ | |
#define B24_POPCNT_H_ | |
#include <limits.h> | |
#include <stdint.h> | |
#include "util.h" | |
#undef define_popcnt | |
#undef popcnt16_impl | |
#undef popcnt32_impl | |
#undef popcnt64_impl | |
#undef popcnt_compat | |
#ifdef _MSC_VER | |
# define popcnt16_impl return __popcnt16 | |
# pragma intrinsic(__popcnt16) | |
# define popcnt32_impl return __popcnt | |
# pragma intrinsic(__popcnt) | |
# ifdef _M_IX86 | |
# undef popcnt32x2 | |
# define popcnt64_impl return popcnt32x2 | |
# define popcnt32x2(x) \ | |
(__popcnt((uint32_t)(x >> 32U)) + \ | |
__popcnt((uint32_t)(x & ~(uint32_t)0))) | |
# else | |
# define popcnt64_impl return __popcnt64 | |
# pragma intrinsic(__popcnt64) | |
# endif | |
# define popcnt_pre_pragma _Pragma("warning(push)") \ | |
_Pragma("warning(disable: 4116)") | |
# define popcnt_post_pragma _Pragma("warning(pop)") | |
#else | |
# if __has_builtin(__builtin_popcount) && (UINT_MAX == UINT32_MAX) | |
# define popcnt16_impl return (uint16_t)__builtin_popcount | |
# define popcnt32_impl return (uint32_t)__builtin_popcount | |
# endif | |
# if !defined(popcnt32_impl) && \ | |
__has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT32_MAX) | |
# define popcnt16_impl return (uint16_t)__builtin_popcountl | |
# define popcnt32_impl return (uint32_t)__builtin_popcountl | |
# endif | |
# if __has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT64_MAX) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountl | |
# endif | |
# if !defined(popcnt64_impl) && \ | |
__has_builtin(__builtin_popcountll) && (ULLONG_MAX == UINT64_MAX) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountll | |
# endif | |
# if __has_builtin(__builtin_popcountg) | |
# if !defined(popcnt16_impl) | |
# define popcnt16_impl return (uint16_t)__builtin_popcountg | |
# endif | |
# if !defined(popcnt32_impl) | |
# define popcnt32_impl return (uint32_t)__builtin_popcountg | |
# endif | |
# if !defined(popcnt64_impl) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountg | |
# endif | |
# endif | |
#endif | |
#define define_popcnt(b, ...) const_inline \ | |
static uint##b##_t popcnt##b(uint##b##_t val) \ | |
{ __VA_ARGS__(val); } _Static_assert(b>0, #b) | |
#define popcnt_compat(x) typeof(_Generic( \ | |
(char(*)[2 - (sizeof(x) > 4U)])0 \ | |
,char(*)[1]: (uint64_t)0 \ | |
,char(*)[2]: (uint32_t)0)) y = (x); \ | |
y -= (typeof(y))-1/3 & (y >> 1U); \ | |
y = ((typeof(y))-1/5 & (y >> 2U)) \ | |
+ ((typeof(y))-1/5 & y); \ | |
return ((typeof(y))-1/ 17 & ((y >> 4U) + y)) \ | |
* ((typeof(y))-1/255) >> (sizeof y - 1U) \ | |
* CHAR_BIT | |
#ifndef popcnt16_impl | |
# define popcnt16_impl popcnt_compat | |
#endif | |
#ifndef popcnt32_impl | |
# define popcnt32_impl popcnt_compat | |
#endif | |
#ifndef popcnt64_impl | |
# define popcnt64_impl popcnt_compat | |
#endif | |
#ifndef popcnt_pre_pragma | |
# define popcnt_pre_pragma | |
#endif | |
#ifndef popcnt_post_pragma | |
# define popcnt_post_pragma | |
#endif | |
define_popcnt(16, popcnt16_impl); | |
define_popcnt(32, popcnt32_impl); | |
define_popcnt(64, popcnt64_impl); | |
#undef define_popcnt | |
#undef popcnt16_impl | |
#undef popcnt32_impl | |
#undef popcnt64_impl | |
#undef popcnt_compat | |
#if defined(_MSC_VER) && defined(_M_IX86) | |
# undef popcnt32x2 | |
#endif | |
#define popcnt(x) popcnt_pre_pragma _Generic((x), \ | |
typeof(_Generic((char)0, \ | |
signed char: (struct{int i;}){0}, \ | |
unsigned char: (struct{int i;}){0}, \ | |
default: (char)0)): popcnt16, \ | |
signed char: popcnt16, short: popcnt16, \ | |
unsigned char: popcnt16, int: popcnt32, \ | |
unsigned short: popcnt16, unsigned: popcnt32, \ | |
long long: popcnt64, unsigned long: _Generic( \ | |
&(int[sizeof 1UL]){0}, \ | |
int(*)[sizeof(uint32_t)]: popcnt32, \ | |
int(*)[sizeof(uint64_t)]: popcnt64), \ | |
unsigned long long: popcnt64, long: _Generic( \ | |
&(int[sizeof 1L]){0}, \ | |
int(*)[sizeof(int32_t)]: popcnt32, \ | |
int(*)[sizeof(int64_t)]: popcnt64)) \ | |
(_Generic((x), default: (x), \ | |
long long: (unsigned long long)(x), \ | |
signed char: (unsigned char)(x), \ | |
short: (unsigned short)(x), \ | |
typeof(_Generic((char)0, \ | |
signed char: \ | |
(struct{int i;}){0}, \ | |
unsigned char: \ | |
(struct{int i;}){0}, \ | |
default: (char)0)): \ | |
(unsigned char)(x), \ | |
long: (unsigned long)(x), \ | |
int: (unsigned)(x))) popcnt_post_pragma | |
#endif /* B24_POPCNT_H_ */ |
/** @file seq.h | |
*/ | |
#ifndef B24_SEQ_H_ | |
#define B24_SEQ_H_ | |
#include "vec.h" | |
#ifdef __aarch64__ | |
static const_inline u8x16 | |
mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return vqtbl1q_u8(state, paths); | |
} | |
static const_inline uint16_t | |
sum_of_abs_diff (u8x16 a, | |
u8x16 b) | |
{ | |
return vaddvq_u8(vabdq_u8(a, b)); | |
} | |
/** | |
* @brief Age and mutate the state. | |
* | |
* Decrement each non-zero strength by one and apply the given mutation. | |
* | |
* @param state The current state. | |
* @param paths The mutation descriptor. | |
* @return The resulting state. | |
*/ | |
static const_inline u8x16 | |
age_and_mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return vqtbl1q_u8( | |
vsubq_u8( | |
state, | |
vbslq_u8( | |
vtstq_u8( | |
state, | |
vdupq_n_u8(0xf0) | |
), | |
vdupq_n_u8(0x10), | |
state | |
) | |
), | |
paths | |
); | |
} | |
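/* A scalar sketch of the operation above (illustrative only, using the
 * same "strength lives in the high nibble" convention as merge_input):
 *
 *   uint8_t aged[16], out[16];
 *   for (int i = 0; i < 16; ++i)
 *           aged[i] = (state[i] & 0xf0) ? (uint8_t)(state[i] - 0x10) : 0;
 *   for (int i = 0; i < 16; ++i)
 *           out[i] = paths[i] < 16 ? aged[paths[i]] : 0;
 */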
/** | |
* @brief Merge input into the current state, preferring the input. | |
* | |
* For each channel, if the input channel has a non-zero strength, | |
* use that channel in the output, otherwise use the corresponding | |
* channel from the current state. | |
* | |
* @param state The current state. | |
* @param input The input to merge into the state. | |
* @return The merged state. | |
*/ | |
static const_inline u8x16 | |
merge_input (u8x16 state, | |
u8x16 input) | |
{ | |
return vbslq_u8( | |
vtstq_u8( | |
input, | |
vdupq_n_u8(0xf0) | |
), | |
input, | |
state | |
); | |
} | |
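/* Equivalent scalar form (illustrative):
 *
 *   for (int i = 0; i < 16; ++i)
 *           out[i] = (input[i] & 0xf0) ? input[i] : state[i];
 */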
static const_inline u8x16 | |
select_nonzero_input (u8x16 state, | |
u8x16 input) | |
{ | |
return vbslq_u8( | |
vtstq_u8( | |
input, | |
vdupq_n_u8(0xff) | |
), | |
input, | |
state | |
); | |
} | |
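/**
 * @brief Expand a 16-bit sequence into its sixteen 4-bit windows.
 *
 * Nibble i of the result holds bits i..i+3 of @p seq, wrapping around at
 * bit 15, i.e. the B(2, 4) subsequence visible at offset i.
 */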
static force_inline uint64_t | |
seq_expand (uint16_t seq) | |
{ | |
uint16_t rot = seq << 12U | seq >> 4U; | |
uint64_t ret = 0; | |
uint64x2_t x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u16( | |
vld1q_lane_u16( | |
&rot, | |
vld1q_lane_u16( | |
&seq, | |
vdupq_n_u16(0), | |
0 | |
), | |
1 | |
) | |
), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x00, 0x00, 0x00, | |
0x02, 0x02, 0x02, 0x02, | |
0x01, 0x01, 0x01, 0x01, | |
0x03, 0x03, 0x03, 0x03}) | |
) | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x003c000f003c000f)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x78001e0078001e00)) | |
), | |
5 | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x000000ff000000ff)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x03fc000003fc0000)) | |
), | |
10 | |
) | |
); | |
vst1q_lane_u64( | |
&ret, | |
vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u64(x), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x01, 0x04, 0x05, | |
0x08, 0x09, 0x0c, 0x0d, | |
0x80, 0x80, 0x80, 0x80, | |
0x80, 0x80, 0x80, 0x80}) | |
) | |
) | |
), | |
0 | |
); | |
return ret; | |
} | |
static force_inline u8x16 | |
seq_expand_u8x16 (uint16_t seq) | |
{ | |
uint64x2_t x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u16( | |
vld1q_lane_u16( | |
(uint16_t[1]){ | |
seq << 12U | | |
seq >> 4U | |
}, | |
vld1q_lane_u16( | |
&seq, | |
vdupq_n_u16(0), | |
0 | |
), | |
1 | |
) | |
), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x00, 0x00, 0x00, | |
0x02, 0x02, 0x02, 0x02, | |
0x01, 0x01, 0x01, 0x01, | |
0x03, 0x03, 0x03, 0x03}) | |
) | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x003c000f003c000f)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x78001e0078001e00)) | |
), | |
5 | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x000000ff000000ff)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x03fc000003fc0000)) | |
), | |
10 | |
) | |
); | |
x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u64(x), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x80, 0x01, 0x80, | |
0x04, 0x80, 0x05, 0x80, | |
0x08, 0x80, 0x09, 0x80, | |
0x0c, 0x80, 0x0d, 0x80}) | |
) | |
) | |
); | |
uint64x2_t y = vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x00f000f000f000f0)) | |
); | |
return vreinterpretq_u8_u64( | |
vorrq_u64( | |
veorq_u64(x, y), | |
vshlq_n_u64(y, 4) | |
) | |
); | |
} | |
static force_inline vec128 | |
seq_expand_vec128 (uint16_t seq) | |
{ | |
return vec128_from_u8x16(seq_expand_u8x16(seq)); | |
} | |
#endif /* __aarch64__ */ | |
#if defined(__x86_64__) || defined(_MSC_VER) | |
# define pext_mask 0x783c1e0f783c1e0fULL | |
# define pdep_mask 0x0f0f0f0f0f0f0f0fULL | |
static force_inline uint64_t | |
seq_expand (uint16_t seq) | |
{ | |
u8x16 k = _mm_shuffle_epi8( | |
_mm_set_epi64x( | |
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq | |
), | |
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL) | |
); | |
return _pext_u64((uint64_t)_mm_extract_epi64(k, 1), pext_mask) << 32U | |
| _pext_u64((uint64_t)_mm_extract_epi64(k, 0), pext_mask); | |
} | |
static force_inline vec128 | |
seq_expand_vec128 (uint16_t seq) | |
{ | |
u8x16 k = _mm_shuffle_epi8( | |
_mm_set_epi64x( | |
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq | |
), | |
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL) | |
); | |
return (vec128){ | |
.u64[0] = _pdep_u64( | |
_pext_u64( | |
(unsigned long long)_mm_extract_epi64(k, 0), | |
pext_mask | |
), | |
pdep_mask | |
), | |
.u64[1] = _pdep_u64( | |
_pext_u64( | |
(unsigned long long)_mm_extract_epi64(k, 1), | |
pext_mask | |
), | |
pdep_mask | |
) | |
}; | |
} | |
static force_inline u8x16 | |
seq_expand_u8x16 (uint16_t seq) | |
{ | |
return u8x16_from_vec128(seq_expand_vec128(seq)); | |
} | |
# undef pdep_mask | |
# undef pext_mask | |
static const_inline u8x16 | |
mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return _mm_shuffle_epi8(state, paths); | |
} | |
static const_inline uint16_t | |
sum_of_abs_diff (u8x16 a, | |
u8x16 b) | |
{ | |
__m128i x = _mm_sad_epu8(a, b); | |
return _mm_extract_epi16(x, 0) | |
+ _mm_extract_epi16(x, 4); | |
} | |
static const_inline u8x16 | |
age_and_mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return _mm_shuffle_epi8( | |
_mm_subs_epu8( | |
state, | |
_mm_set_epi64x( | |
0x1010101010101010LL, | |
0x1010101010101010LL | |
) | |
), | |
paths | |
); | |
} | |
static const_inline u8x16 | |
merge_input (u8x16 state, | |
u8x16 input) | |
{ | |
return _mm_blendv_epi8( | |
state, | |
input, | |
/* Mask construction: adding 0x70 (saturated) | |
* sets the top bit iff the value is at least | |
* 0x10, and the result can be used as a mask | |
* in `_mm_blendv_epi8()`. | |
*/ | |
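		/* e.g. 0x0f + 0x70 == 0x7f (top bit clear, keep state),
		 * 0x10 + 0x70 == 0x80 (top bit set, take input), and
		 * 0x9f + 0x70 saturates to 0xff (take input).
		 */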
_mm_adds_epu8( | |
input, | |
_mm_set_epi64x( | |
0x7070707070707070LL, | |
0x7070707070707070LL | |
) | |
) | |
); | |
} | |
static const_inline u8x16 | |
select_nonzero_input (u8x16 state, | |
u8x16 input) | |
{ | |
return _mm_blendv_epi8( | |
input, | |
state, | |
_mm_cmpeq_epi8( | |
input, | |
_mm_set_epi64x(0LL, 0LL) | |
) | |
); | |
} | |
#endif /* __x86_64__ || _MSC_VER */ | |
static const_inline uint16_t | |
mutation_sad (u8x16 state, | |
u8x16 paths) | |
{ | |
return sum_of_abs_diff( | |
state, | |
mutate(state, paths) | |
); | |
} | |
#endif /* B24_SEQ_H_ */ |
/** | |
* @file test-cortex.c | |
* | |
* Compiling with MSVC 19.41 or later: | |
* | |
 * cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 cortex.c layer.c test-cortex.c /Fe: test-cortex.exe /MD /link ntdll.lib
*/ | |
#include <inttypes.h> | |
#include <math.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include "cortex.h" | |
int | |
main (void) | |
{ | |
char screen[69897]; | |
char *s = screen; | |
struct cortex *ctx = cortex_create(); | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = '2'; | |
*s++ = 'J'; | |
for (uint64_t k = 0U;; ++k) { | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = 'H'; | |
for (uint16_t row = 0U; row < 128U; row += 2U) { | |
uint8_t r = 255U; | |
for (uint16_t col = 0U; col < 128U; col += 2U) { | |
struct cortex_loc loc = | |
linear_loc_fwd( | |
(struct linear_loc){ | |
.row = row, | |
.col = col | |
} | |
); | |
uint8_t b = ctx->data[0].v[loc.seq << 4U | loc.rot] | |
.u8[loc.chn ]; | |
uint8_t p = b >> 4U; | |
uint8_t q = b & 15U; | |
if (p != r) { | |
r = p; | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = '3'; | |
*s++ = '8'; | |
*s++ = ';'; | |
*s++ = '5'; | |
*s++ = ';'; | |
p = 232U + ((p & 8U) | |
? -((16U - p) * 23U - 3U) | |
: p); | |
if (p >= 10U) { | |
if (p >= 100U) { | |
*s++ = (char)(p / 100U + (uint8_t)'0'); | |
p %= 100U; | |
} | |
*s++ = (char)(p / 10U + (uint8_t)'0'); | |
p %= 10U; | |
} | |
*s++ = (char)(p + (uint8_t)'0'); | |
*s++ = 'm'; | |
} | |
*s++ = "0123456789abcdef"[p]; | |
*s++ = "0123456789abcdef"[q]; | |
} | |
*s++ = '\n'; | |
} | |
*s++ = '\0'; | |
s = screen; | |
printf("%s\033[m%128" PRIu64 "\n", s, k); | |
cortex_tick(ctx); | |
//hol_up(100000000); | |
} | |
cortex_destroy(&ctx); | |
} |
/** @file util.h | |
*/ | |
#ifndef B24_UTIL_H_ | |
#define B24_UTIL_H_ | |
#ifdef _MSC_VER | |
# define WIN32_LEAN_AND_MEAN | |
# include <intrin.h> | |
# include <windows.h> | |
# include <sysinfoapi.h> | |
#else | |
# if defined(__aarch64__) | |
# include <arm_neon.h> | |
# elif defined(__x86_64__) | |
# include <immintrin.h> | |
# endif | |
#endif | |
#include <stddef.h> | |
#include <stdint.h> | |
#ifdef __aarch64__ | |
typedef uint8x16_t u8x16; | |
#else | |
typedef __m128i u8x16; | |
#endif | |
#ifndef __has_builtin | |
# define __has_builtin(...) 0 | |
#endif | |
#ifdef _MSC_VER | |
# define __attribute__(...) | |
# define force_inline __forceinline | |
# define pragma_msvc(...) _Pragma(#__VA_ARGS__) | |
# define typeof __typeof__ | |
#else | |
# define force_inline __attribute__((always_inline)) inline | |
# define pragma_msvc(...) | |
#endif | |
#define array_size(a) (sizeof (a) / sizeof (a)[0]) | |
#define const_function __attribute__((const)) | |
#define const_inline const_function force_inline | |
#define pure_inline __attribute__((pure)) force_inline | |
/** | |
* @brief Macros for doing Morton encoding stuff. | |
* | |
* We employ Morton encoding to map linear coordinates (e.g. array offsets | |
* and bit indices) onto a quad tree space. This allows us to think of the | |
* layer topology as recursive nested quadrants, which is a more intuitive | |
* way to be confused. | |
* | |
* What happens in practice is that we take the upper and lower halves of | |
* an index and interleave them to form a single Morton encoded address. | |
* `fwd()` converts a linear address to a quad tree representation, `rev()` | |
* does the inverse, and `M()` computes the mask for a given bit width and | |
* interleaving pattern. | |
* | |
* Common parameters: | |
* | |
* @param T The integer width to use for computing and storing the Morton | |
* encoded value (usually 32 or 64). | |
* @param W The number of value bits to encode. Must be a multiple of 2. | |
* @param odd Whether to operate on the odd or even bits of the value. | |
* @param v The value to encode. | |
*/ | |
#define M(T, W, odd) (((uint##T##_t)-1) / 3U << !!(odd) >> ((8U << ((W > 32U) + 2U)) - W)) | |
#define fwd(T, W, v) (_pdep_u##T((v), M(T, W, 0)) | _pdep_u##T((v) >> W / 2U, M(T, W, 1))) | |
#define rev(T, W, v) (_pext_u##T((v), M(T, W, 0)) | _pext_u##T((v), M(T, W, 1)) << W / 2U) | |
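/* Worked example (illustrative): with T = 32 and W = 8,
 * M(32, 8, 0) == 0x55 and M(32, 8, 1) == 0xaa, so
 *
 *   fwd(32, 8, 0x2b) == 0x4d   // low nibble 0xb -> even bits,
 *                              // high nibble 0x2 -> odd bits
 *   rev(32, 8, 0x4d) == 0x2b   // and back again
 */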
#define container_of(ptr, type, member) \ | |
((type *)((char *)(1 ? (ptr) : &((type *)0)->member) - offsetof(type, member))) | |
#endif /* B24_UTIL_H_ */ |
/** @file vec.h | |
*/ | |
#ifndef B24_VEC_H_ | |
#define B24_VEC_H_ | |
#include <stdint.h> | |
#include "util.h" | |
typedef struct vec4x2 { | |
uint8_t x : 4; uint8_t y : 4; | |
} vec4x2; | |
typedef struct vec4x4 { | |
uint16_t x0 : 4; uint16_t y0 : 4; | |
uint16_t x1 : 4; uint16_t y1 : 4; | |
} vec4x4; | |
typedef struct vec4x8 { | |
uint32_t x0 : 4; uint32_t y0 : 4; | |
uint32_t x1 : 4; uint32_t y1 : 4; | |
uint32_t x2 : 4; uint32_t y2 : 4; | |
uint32_t x3 : 4; uint32_t y3 : 4; | |
} vec4x8; | |
typedef struct vec4x16 { | |
uint64_t x0 : 4; uint64_t y0 : 4; | |
uint64_t x1 : 4; uint64_t y1 : 4; | |
uint64_t x2 : 4; uint64_t y2 : 4; | |
uint64_t x3 : 4; uint64_t y3 : 4; | |
uint64_t x4 : 4; uint64_t y4 : 4; | |
uint64_t x5 : 4; uint64_t y5 : 4; | |
uint64_t x6 : 4; uint64_t y6 : 4; | |
uint64_t x7 : 4; uint64_t y7 : 4; | |
} vec4x16; | |
typedef struct vec128 { | |
union { | |
uint8_t u8[ 16]; | |
vec4x2 u4x2[16]; | |
uint16_t u16[ 8]; | |
vec4x4 u4x4[ 8]; | |
uint32_t u32[ 4]; | |
vec4x8 u4x8[ 4]; | |
uint64_t u64[ 2]; | |
vec4x16 u4x16[2]; | |
}; | |
} vec128; | |
_Static_assert(sizeof(vec4x2 ) == 1U,""); | |
_Static_assert(sizeof(vec4x4 ) == 2U,""); | |
_Static_assert(sizeof(vec4x8 ) == 4U,""); | |
_Static_assert(sizeof(vec4x16) == 8U,""); | |
_Static_assert(sizeof(vec128 ) == 16U,""); | |
struct chunk { | |
vec128 v[256]; | |
}; | |
#ifdef __aarch64__ | |
static pure_inline u8x16 | |
u8x16_from_vec128 (vec128 v) | |
{ | |
return vld1q_u8(&v.u8[0]); | |
} | |
static pure_inline vec128 | |
vec128_from_u8x16 (u8x16 v) | |
{ | |
vec128 ret; | |
vst1q_u8(&ret.u8[0], v); | |
return ret; | |
} | |
#else | |
static pure_inline u8x16 | |
u8x16_from_vec128 (vec128 v) | |
{ | |
return _mm_set_epi64x( | |
(long long)v.u64[1], | |
(long long)v.u64[0] | |
); | |
} | |
static pure_inline vec128 | |
vec128_from_u8x16 (u8x16 v) | |
{ | |
return (vec128){ | |
.u64[0] = (uint64_t)_mm_extract_epi64(v, 0), | |
.u64[1] = (uint64_t)_mm_extract_epi64(v, 1), | |
}; | |
} | |
#endif | |
#endif /* B24_VEC_H_ */ |
// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill) | |
// | |
// Copyright (c) 2020 Zach Wegner | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
#include "popcnt.h" | |
// ZP7: branchless PEXT/PDEP replacement code for non-Intel processors | |
// | |
// The PEXT/PDEP instructions are pretty cool, with various (usually arcane) | |
// uses, behaving like bitwise gather/scatter instructions. They were introduced | |
// by Intel with the BMI2 instructions on Haswell. | |
// | |
// AMD processors implement these instructions, but very slowly. PEXT/PDEP can | |
// take from 18 to ~300 cycles, depending on the input mask. See this table: | |
// https://mobile.twitter.com/InstLatX64/status/1209095219087585281 | |
// Other processors don't have PEXT/PDEP at all. This code is a polyfill for | |
// these processors. It's much slower than the raw instructions on Intel chips | |
// (which have 3-cycle latency and 1-per-cycle throughput), but should be faster than AMD's implementation. | |
// | |
// Description of the algorithm | |
// ==== | |
// | |
// This code uses a "parallel prefix popcount" technique (hereafter PPP for | |
// brevity). What this means is that we determine, for every bit in the input | |
// mask, how many bits below it are set. Or rather, aren't set--we need to get | |
// a count of how many bits each input bit should be shifted to get to its final | |
// position, that is, the difference between the bit-index of its destination | |
// and its original bit-index. This is the same as the count of unset bits in | |
// the mask below each input bit. | |
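// A small worked example (8 bits for brevity): for mask = 0b10110010 the | |
// number of unset mask bits below positions 0..7 is 0,1,1,2,3,3,3,4. For | |
// PEXT the set mask bits at positions 1,4,5,7 therefore shift right by | |
// 1,3,3,4 and land at positions 0,1,2,3, which is exactly the compacted | |
// result. | |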
// | |
// The dumb way to get this PPP would be to create a 64-element array in a loop, | |
// but we want to do this in a bit-parallel fashion. So we store the counts | |
// "vertically" across six 64-bit values: one 64-bit value holds bit 0 of each | |
// of the 64 counts, another holds bit 1, etc. We can compute these counts | |
// fairly easily using a parallel prefix XOR (XOR is equivalent to a 1-bit | |
// adder that wraps around and ignores carrying). Using parallel prefix XOR as | |
// a 1-bit adder, we can build an n-bit adder by shifting the result left by | |
// one and ANDing with the input bits: this computes the carry by seeing where | |
// an input bit causes the 1-bit sum to overflow from 1 to 0. The shift left | |
// is needed anyways, because we want the PPP values to represent population | |
// counts *below* each bit, not including the bit itself. | |
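// For instance, the parallel prefix XOR of 0b00010010 is 0b00001110: each | |
// result bit is the parity of the input bits at or below it. ANDing the | |
// left-shifted parity with the input then marks position 4, the only place | |
// where a 1-bit sum wraps from 1 to 0 and a carry must propagate. | |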
// | |
// For processors with the CLMUL instructions (most x86 CPUs since ~2010), we | |
// can do the parallel prefix XOR and left shift in one instruction, by | |
// doing a carry-less multiply by -2. | |
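// (A carry-less multiply by -2 XORs together x<<1, x<<2, ..., x<<63, so bit | |
// j of the low half of the product is the parity of the bits of x strictly | |
// below j: the shifted parallel prefix XOR in a single instruction.) | |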
// | |
// Anyways, once we have these six 64-bit values of the PPP, we can use each | |
// PPP bit to shift input bits by a power of two. That is, input bits that are | |
// in the bit-0 PPP mask are shifted by 2**0==1, bits in the bit-1 mask get | |
// shifted by 2, and so on, for shifts by 4, 8, 16, and 32 bits. Out of these | |
// six shifts, any shift value between 0 and 63 can be composed. | |
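// For example, an input bit whose PPP count is 13 == 8 + 4 + 1 is set in the | |
// bit-0, bit-2 and bit-3 PPP masks, so it is moved by the 1-, 4- and 8-bit | |
// passes and left alone by the 2-, 16- and 32-bit passes. | |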
// | |
// For PEXT, we have to perform each shift in increasing order (1, 2, ...32) so | |
// that input bits don't overlap in the intermediate results. PDEP is the | |
// opposite: the 32-bit shift needs to happen first to make room for the smaller | |
// shifts. There's also a small complication for PDEP in that the PPP values | |
// line up where the input bits *end*, rather than where the input bits start | |
// like for PEXT. This means each bit mask needs to be shifted backwards before | |
// ANDing with the input bits. | |
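// (In zp7_pdep_pre_64() below, that backwards shift is the | |
// `masks->ppp_bit[i] >> shift` step: each PPP mask is pulled back by the | |
// partial shift before selecting which input bits that pass moves.) | |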
// | |
// For both PEXT/PDEP the input bits need to be pre-masked so that only the | |
// relevant bits are being shifted around. For PEXT, this is a simple AND | |
// (input &= mask), but for PDEP we have to mask out everything but the low N | |
// bits, where N is the population count of the mask. | |
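// Example: for mask = 0xf0 (popcount 4) only the low four input bits can | |
// reach the result, so PDEP of a = 0xab first keeps just 0xb and then | |
// deposits it into the mask bits, giving 0xb0. | |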
#define N_BITS (6) | |
typedef struct { | |
uint64_t mask; | |
uint64_t ppp_bit[N_BITS]; | |
} zp7_masks_64_t; | |
#if !(defined(__aarch64__) && defined(__ARM_FEATURE_AES)) && \ | |
!((defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__)) | |
// If we don't have access to the CLMUL instruction, emulate it with | |
// shifts and XORs | |
# define prefix_sum(x) ({ \ | |
typeof(_Generic(x, uint32_t:(uint32_t)0, uint64_t:(uint64_t)0)) y = x; \ | |
y ^= y << 1U; y ^= y << 2U; y ^= y << 4U; y ^= y << 8U; y ^= y << 16U; \ | |
_Generic(y, uint32_t:y, uint64_t:y ^ y << 32U); }) | |
#endif | |
// Parallel-prefix-popcount. This is used by both the PEXT and PDEP polyfills. | |
// It can also be called separately and cached if the mask value will be used | |
// more than once; the cached masks can be shared between PEXT and PDEP calls | |
// that use the same mask (see the usage sketch at the end of this file). | |
const_function | |
static zp7_masks_64_t zp7_ppp_64(uint64_t mask) { | |
zp7_masks_64_t r; | |
r.mask = mask; | |
// Count *unset* bits | |
mask = ~mask; | |
#if defined(__aarch64__) && defined(__ARM_FEATURE_AES) | |
uint64x2_t m = vdupq_n_u64(mask); | |
uint64x2_t neg_2 = vdupq_n_u64(-2LL); | |
for (int i = 0; i < N_BITS - 1; i++) { | |
uint64x2_t bit = vreinterpretq_u64_p128(vmull_p64( | |
vgetq_lane_u64(m, 0), vgetq_lane_u64(neg_2, 0))); | |
r.ppp_bit[i] = vgetq_lane_u64(bit, 0); | |
m = vandq_u64(m, bit); | |
} | |
r.ppp_bit[N_BITS - 1] = -vgetq_lane_u64(m, 0) << 1; | |
#elif (defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__) | |
// Move the mask and -2 to XMM registers for CLMUL | |
__m128i m = _mm_cvtsi64_si128(mask); | |
__m128i neg_2 = _mm_cvtsi64_si128(-2LL); | |
for (int i = 0; i < N_BITS - 1; i++) { | |
// Do a 1-bit parallel prefix popcount, shifted left by 1, | |
// in one carry-less multiply by -2. | |
__m128i bit = _mm_clmulepi64_si128(m, neg_2, 0); | |
r.ppp_bit[i] = _mm_cvtsi128_si64(bit); | |
// Get the carry bit of the 1-bit parallel prefix popcount. On | |
// the next iteration, we will sum this bit to get the next mask | |
m = _mm_and_si128(m, bit); | |
} | |
// For the last iteration, we can use a regular multiply by -2 instead of a | |
// carry-less one (or rather, a strength reduction of that, with | |
// neg/add/etc), since there can't be any carries anyways. That is because | |
// the last value of m (which has one bit set for every 32nd unset mask bit) | |
// has at most two bits set in it, when mask is zero and thus there are 64 | |
// bits set in ~mask. If two bits are set, one of them is the top bit, which | |
// gets shifted out, since we're counting bits below each mask bit. | |
r.ppp_bit[N_BITS - 1] = -(uint64_t)_mm_cvtsi128_si64(m) << 1; | |
#else | |
for (int i = 0; i < N_BITS - 1; i++) { | |
// Do a 1-bit parallel prefix popcount, shifted left by 1 | |
uint64_t bit = prefix_sum(mask << 1); | |
r.ppp_bit[i] = bit; | |
// Get the carry bit of the 1-bit parallel prefix popcount. On | |
// the next iteration, we will sum this bit to get the next mask | |
mask &= bit; | |
} | |
// The last iteration won't carry, so just use neg/shift. See the CLMUL | |
// case above for justification. | |
r.ppp_bit[N_BITS - 1] = -mask << 1; | |
#endif | |
return r; | |
} | |
// PEXT | |
static force_inline uint64_t | |
zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) { | |
// Mask only the bits that are set in the input mask. Otherwise they collide | |
// with input bits and screw everything up | |
a &= masks->mask; | |
// For each bit in the PPP, shift right only those bits that are set in | |
// that bit's mask | |
for (int i = 0; i < N_BITS; i++) { | |
uint64_t shift = 1 << i; | |
uint64_t bit = masks->ppp_bit[i]; | |
// Shift only the input bits that are set in that bit's mask | |
a = (a & ~bit) | ((a & bit) >> shift); | |
} | |
return a; | |
} | |
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) asm("_pext_u64"); | |
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) { | |
zp7_masks_64_t masks = zp7_ppp_64(mask); | |
return zp7_pext_pre_64(a, &masks); | |
} | |
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) asm("_pext_u32"); | |
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) { | |
return (uint32_t)zp7_pext_64(a, mask); | |
} | |
// PDEP | |
static force_inline uint64_t | |
zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks) { | |
uint64_t nbits = popcnt(masks->mask); | |
// Mask just the bits that will end up in the final result--the low P bits, | |
// where P is the popcount of the mask. The other bits would collide. | |
// We need special handling for the mask==-1 case: because 64-bit shifts are | |
// implicitly modulo 64 on x86 (and (uint64_t)1 << 64 is technically | |
// undefined behavior in C), the regular "a &= (1 << pop) - 1" doesn't | |
// work: (1 << popcnt(-1)) - 1 == (1 << 64) - 1 == (1 << 0) - 1 == 0, but | |
// this should be -1. The BZHI instruction (introduced with BMI2, the same | |
// instructions as PEXT/PDEP) handles this properly, but isn't portable. | |
#if (defined(__i386__) || defined(__x86_64__)) && defined(__BMI2__) | |
a = _bzhi_u64(a, nbits); | |
#else | |
// If we don't have BZHI, use a portable workaround: nbits >> 6 is 1 only | |
// when mask == -1 (nbits == 64), so it zeroes pop_mask in exactly that | |
// case, and masking the shift count with 63 keeps the shift well-defined. | |
uint64_t pop_mask = (1ULL << (nbits & 63U)) & ~(nbits >> 6); | |
a &= pop_mask - 1; | |
#endif | |
// For each bit in the PPP, shift left only those bits that are set in | |
// that bit's mask. We do this in the opposite order compared to PEXT | |
for (int i = N_BITS - 1; i >= 0; i--) { | |
uint64_t shift = 1 << i; | |
uint64_t bit = masks->ppp_bit[i] >> shift; | |
// Micro-optimization: the bits that get shifted and those that don't | |
// will always be disjoint, so we can add them instead of ORing them. | |
// The shifts of 1 and 2 can thus merge with the adds to become LEAs. | |
a = (a & ~bit) + ((a & bit) << shift); | |
} | |
return a; | |
} | |
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) asm("_pdep_u64"); | |
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) { | |
zp7_masks_64_t masks = zp7_ppp_64(mask); | |
return zp7_pdep_pre_64(a, &masks); | |
} | |
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) asm("_pdep_u32"); | |
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) { | |
return (uint32_t)zp7_pdep_64(a, mask); | |
} |
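// A minimal caching sketch (illustrative only; `process_with_shared_mask`, | |
// `vals`, `out` and `n` are hypothetical names). Because zp7_ppp_64() and the | |
// *_pre_64() helpers are static, this pattern applies within this file: | |
#if 0 | |
static void process_with_shared_mask(const uint64_t *vals, uint64_t *out, | |
                                     size_t n, uint64_t mask) { | |
    // Compute the parallel-prefix-popcount masks once and reuse them for | |
    // every extraction that uses the same mask. | |
    zp7_masks_64_t cached = zp7_ppp_64(mask); | |
    for (size_t i = 0; i < n; i++) | |
        out[i] = zp7_pext_pre_64(vals[i], &cached); | |
} | |
#endif | |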
#ifndef ZP7_H_ | |
#define ZP7_H_ | |
#ifndef __cplusplus | |
#include <stdint.h> | |
#define STD | |
#else // __cplusplus | |
#include <cstdint> | |
#define STD ::std:: | |
extern "C" { | |
#endif // __cplusplus | |
extern STD uint32_t _pext_u32(STD uint32_t a, STD uint32_t mask); | |
extern STD uint64_t _pext_u64(STD uint64_t a, STD uint64_t mask); | |
extern STD uint32_t _pdep_u32(STD uint32_t a, STD uint32_t mask); | |
extern STD uint64_t _pdep_u64(STD uint64_t a, STD uint64_t mask); | |
#undef STD | |
#ifdef __cplusplus | |
} | |
#endif // __cplusplus | |
#endif // ZP7_H_ |