B(2, 4)
/** | |
* @file b24.c | |
* | |
* Compiling with MSVC 19.41 or later: | |
* | |
* cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 b24.c /Fe: b24.exe /MD | |
*/ | |
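/*
 * A roughly equivalent non-MSVC build (an illustrative guess, not taken
 * from the original notes) would look something like:
 *
 *   cc -std=c2x -O2 -march=x86-64-v3 b24.c -o b24
 *
 * where -march=x86-64-v3 (or -mavx2 -mbmi2) supplies the AVX2/BMI2
 * intrinsics used by seq.h on x86-64 and -std=c2x enables _BitInt.
 */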
#include <errno.h> | |
#include <inttypes.h> | |
#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include "neuron.h" | |
#include "popcnt.h" | |
#include "vec.h" | |
#if defined(__INTELLISENSE__) || defined(_MSC_VER) | |
# define BITINT_C(n) n | |
# define _BitInt(n) int | |
#else | |
# define BITINT_C(n) n##WB | |
#endif | |
#ifdef _MSC_VER | |
# pragma intrinsic(_BitScanForward) | |
static const_inline int __builtin_ctzl(unsigned long x) { | |
unsigned long y; | |
return _BitScanForward(&y, x) ? (int)y : (int)sizeof(x) * CHAR_BIT; | |
} | |
#endif | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4710)) | |
pragma_msvc(warning(disable: 4711)) | |
static const_inline char | |
hexdig (unsigned _BitInt(4) n) | |
{ | |
return (const char[16]){ | |
'0','1','2','3','4','5','6','7', | |
'8','9','a','b','c','d','e','f', | |
}[n]; | |
} | |
static inline char * | |
decstr_u5 (char *dst, | |
unsigned _BitInt(5) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(10U)) { | |
*dst++ = (char)((unsigned char)'0' + src / BITINT_C(10U)); | |
src %= BITINT_C(10U); | |
} else if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
#if 0 | |
static force_inline char * | |
decstr_s5 (char *dst, | |
unsigned _BitInt(5) src, | |
bool pad) | |
{ | |
const typeof(src) lt0 = src >> 4; | |
src = (src - lt0) ^ ((BITINT_C(31U) ^ BITINT_C(31U)) - lt0); | |
if (lt0) { | |
if (src >= BITINT_C(10U)) { | |
*dst++ = '-'; | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else { | |
if (pad) | |
*dst++ = ' '; | |
*dst++ = '-'; | |
} | |
} else { | |
if (src >= BITINT_C(10U)) { | |
if (pad) | |
*dst++ = ' '; | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else if (pad) { | |
*dst++ = ' '; | |
*dst++ = ' '; | |
} | |
} | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
#endif | |
static force_inline char * | |
decstr_u4 (char *dst, | |
unsigned _BitInt(4) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(10U)) { | |
*dst++ = '1'; | |
src -= BITINT_C(10U); | |
} else if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
return dst; | |
} | |
static force_inline char * | |
decstr_s4 (char *dst, | |
unsigned _BitInt(4) src, | |
bool aln) | |
{ | |
if (src >= BITINT_C(8U)) { | |
*dst++ = '-'; | |
*dst++ = (char)((unsigned char)'8' + | |
(unsigned char)8 - src); | |
} else { | |
if (aln) | |
*dst++ = ' '; | |
*dst++ = (char)((unsigned char)'0' + src); | |
} | |
return dst; | |
} | |
static const uint16_t b24[] = { | |
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU, | |
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U, | |
0xcbdU, 0xd2fU, 0xd79U, 0xde5U, | |
0xf2dU, 0xf4bU, 0xf59U, 0xf65U, | |
}; | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4200)) | |
pragma_msvc(warning(disable: 4820)) | |
struct b24_cfg { | |
uint32_t have_opt; | |
uint16_t rotation; | |
uint16_t offset; | |
uint64_t neuron; | |
size_t n_seq; | |
uint16_t seq[]; | |
}; | |
pragma_msvc(warning(pop)) | |
typedef typeof((struct b24_cfg){0}.have_opt) b24_cfg_flags; | |
typedef typeof((struct b24_cfg){0}.offset) b24_cfg_offset; | |
struct trace { | |
uint64_t cyc; | |
uint32_t seq; | |
uint16_t len; | |
uint16_t map; | |
}; | |
const_function | |
static struct trace | |
b24_trace_init (struct b24_cfg const *const cfg, | |
unsigned i) | |
{ | |
uint32_t seq = cfg->seq[i] | ((uint32_t)cfg->seq[i] << 16U); | |
return (struct trace){ | |
.cyc = 0U, | |
.seq = cfg->rotation ? (seq >> (16 - cfg->rotation)) | |
| (seq << cfg->rotation) : seq, | |
.len = 0U, | |
.map = 0U, | |
}; | |
} | |
const_function | |
static struct trace | |
b24_trace_from (struct trace tc, | |
b24_cfg_offset off) | |
{ | |
tc.cyc = off; | |
tc.len = 1U; | |
tc.map = 1U << off; | |
for (;;) { | |
off = (tc.seq >> off) & (b24_cfg_offset)15; | |
typeof(tc.map) bit = (typeof(bit))1 << off; | |
if (tc.map & bit) | |
break; | |
tc.map |= bit; | |
tc.cyc |= (typeof(tc.cyc))off | |
<< (tc.len++ << 2U); | |
} | |
return tc; | |
} | |
const_function | |
static struct trace | |
b24_trace_next (struct trace tc) | |
{ | |
if (tc.map == (typeof(tc.map))0xffff) | |
return (struct trace){0}; | |
b24_cfg_offset off = 0U; | |
uint_fast16_t map = tc.map; | |
while (map & (typeof(map))1) { | |
map >>= 1U; | |
++off; | |
} | |
return b24_trace_from(tc, off); | |
} | |
enum b24_opt { | |
/* The std_in option ('-') is 0 to guarantee all | |
* argument-accepting options stored in `expect` | |
* evaluate to true during argument parsing. | |
*/ | |
b24_opt_std_in , // - (unused for now) | |
b24_opt_big_hex , // -b | |
b24_opt_cycles , // -c | |
b24_opt_debruijn, // -d | |
b24_opt_entries , // -e | |
b24_opt_graphviz, // -g | |
b24_opt_help , // -h | |
b24_opt_json , // -j | |
b24_opt_neuron , // -n | |
b24_opt_one_path, // -o | |
b24_opt_python , // -p | |
b24_opt_expand , // -q | |
b24_opt_rotation, // -r | |
b24_opt_sig_path, // -s | |
b24_opt_no_space, // -w | |
b24_opt_hex_path, // -x | |
b24_opt_no_align, // -y | |
b24_opt_no_zeros, // -z | |
// qualifier flags | |
ONLY_ONCE = 0x40U, | |
NEEDS_ARG = 0x80U, | |
}; | |
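/* Map an option's enum constant to its command-line character at compile
 * time: the compound literal `(int[1U+b24_opt_##x]){0}` has an array type
 * whose length encodes the enum value, so _Generic can dispatch on the
 * resulting pointer-to-array type.
 */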
#define b24_opt_char(x) \ | |
_Generic(&(int[1U+b24_opt_##x]){0} \ | |
, int(*)[1U+b24_opt_std_in ]: 0 \ | |
, int(*)[1U+b24_opt_big_hex ]: 'b' \ | |
, int(*)[1U+b24_opt_cycles ]: 'c' \ | |
, int(*)[1U+b24_opt_debruijn]: 'd' \ | |
, int(*)[1U+b24_opt_entries ]: 'e' \ | |
, int(*)[1U+b24_opt_graphviz]: 'g' \ | |
, int(*)[1U+b24_opt_help ]: 'h' \ | |
, int(*)[1U+b24_opt_json ]: 'j' \ | |
, int(*)[1U+b24_opt_neuron ]: 'n' \ | |
, int(*)[1U+b24_opt_one_path]: 'o' \ | |
, int(*)[1U+b24_opt_python ]: 'p' \ | |
, int(*)[1U+b24_opt_expand ]: 'q' \ | |
, int(*)[1U+b24_opt_rotation]: 'r' \ | |
, int(*)[1U+b24_opt_sig_path]: 's' \ | |
, int(*)[1U+b24_opt_no_space]: 'w' \ | |
, int(*)[1U+b24_opt_hex_path]: 'x' \ | |
, int(*)[1U+b24_opt_no_align]: 'y' \ | |
, int(*)[1U+b24_opt_no_zeros]: 'z' ) | |
static const char b24_opt_to_char[] = { | |
#define b24_option(x) [b24_opt_##x] = b24_opt_char(x) | |
b24_option(std_in ), | |
b24_option(big_hex ), | |
b24_option(cycles ), | |
b24_option(debruijn), | |
b24_option(entries ), | |
b24_option(graphviz), | |
b24_option(help ), | |
b24_option(json ), | |
b24_option(neuron ), | |
b24_option(one_path), | |
b24_option(python ), | |
b24_option(expand ), | |
b24_option(rotation), | |
b24_option(sig_path), | |
b24_option(no_space), | |
b24_option(hex_path), | |
b24_option(no_align), | |
b24_option(no_zeros), | |
#undef b24_option | |
}; | |
static const unsigned char | |
b24_char_to_opt[1U << CHAR_BIT] = { | |
#define b24_option(x) [b24_opt_char(x)] = b24_opt_##x | |
b24_option(std_in )| ONLY_ONCE, | |
b24_option(big_hex )| ONLY_ONCE, | |
b24_option(cycles )| ONLY_ONCE, | |
b24_option(debruijn)| NEEDS_ARG, | |
b24_option(entries )| ONLY_ONCE, | |
b24_option(graphviz)| ONLY_ONCE, | |
b24_option(help )| ONLY_ONCE, | |
b24_option(json )| ONLY_ONCE, | |
b24_option(neuron )| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(one_path)| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(python )| ONLY_ONCE, | |
b24_option(expand )| ONLY_ONCE, | |
b24_option(rotation)| ONLY_ONCE | |
| NEEDS_ARG, | |
b24_option(sig_path)| ONLY_ONCE, | |
b24_option(no_space)| ONLY_ONCE, | |
b24_option(hex_path)| ONLY_ONCE, | |
b24_option(no_align)| ONLY_ONCE, | |
b24_option(no_zeros)| ONLY_ONCE, | |
#undef b24_option | |
}; | |
/** | |
* @brief Check if option `opt` is present. | |
*/ | |
static pure_inline bool | |
b24_has_option (struct b24_cfg const *const cfg, | |
const enum b24_opt opt) | |
{ | |
return cfg->have_opt & ((b24_cfg_flags)1 << opt); | |
} | |
/** | |
* @brief Mark option `opt` as present. | |
*/ | |
static force_inline void | |
b24_option_add (struct b24_cfg *const cfg, | |
const enum b24_opt opt) | |
{ | |
cfg->have_opt |= (b24_cfg_flags)1 << opt; | |
} | |
struct b24_syntax { | |
char bra; | |
char ket; | |
char sep; | |
char spc; | |
}; | |
static force_inline char * | |
hexstr_u4 (char *dst, | |
unsigned _BitInt(4) src) | |
{ | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
*dst++ = hexdig(src); | |
return dst; | |
} | |
static inline char * | |
hexstr_u16 (char *dst, | |
uint16_t src, | |
bool pad) | |
{ | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
if (pad) { | |
*dst++ = hexdig(src >> 12U ); | |
*dst++ = hexdig(src >> 8U & 15U); | |
*dst++ = hexdig(src >> 4U & 15U); | |
} else if (src > UINT16_C(0x000f)) { | |
if (src > UINT16_C(0x00ff)) { | |
if (src > UINT16_C(0x0fff)) | |
*dst++ = hexdig(src >> 12U); | |
*dst++ = hexdig(src >> 8U & 15U); | |
} | |
*dst++ = hexdig(src >> 4U & 15U); | |
} | |
*dst++ = hexdig(src & 15U); | |
return dst; | |
} | |
static force_inline char * | |
hexstr_16x4 (char *dst, | |
uint64_t src, | |
unsigned len, | |
struct b24_syntax syn) | |
{ | |
if (len) { | |
for (;; src >>= 4) { | |
dst = hexstr_u4(dst, src & 15U); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
} | |
} | |
return dst; | |
} | |
static char * | |
hexstr_u64 (char *dst, | |
uint64_t src, | |
bool pad) | |
{ | |
char buf[16] = "000000000000000"; | |
char *p = &buf[15]; | |
for (;; --p) { | |
*p = hexdig(src & 15U); | |
src >>= 4U; | |
if (!src) | |
break; | |
} | |
if (pad) | |
p = &buf[0]; | |
*dst++ = '0'; | |
*dst++ = 'x'; | |
for (;; ++p) { | |
*dst++ = *p; | |
if (p == &buf[15]) | |
break; | |
} | |
return dst; | |
} | |
static force_inline char * | |
decstr_16x4 (char *dst, | |
uint64_t src, | |
unsigned len, | |
struct b24_syntax syn, | |
bool aln, | |
bool sig) | |
{ | |
if (len) { | |
bool w = syn.spc; | |
if (sig) { | |
for (;; src >>= 4) { | |
dst = decstr_s4(dst, src & 15U, aln); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst += w; | |
} | |
} else { | |
for (;; src >>= 4) { | |
dst = decstr_u4(dst, src & 15U, aln); | |
if (!--len) | |
break; | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst += w; | |
} | |
} | |
} | |
return dst; | |
} | |
static char * | |
b24_cfg_trace_print (struct b24_cfg const *cfg, | |
char *dst, | |
struct trace src, | |
unsigned idx, | |
struct b24_syntax syn) | |
{ | |
bool align = !b24_has_option(cfg, b24_opt_no_align); | |
bool zeros = !b24_has_option(cfg, b24_opt_no_zeros); | |
bool expand = b24_has_option(cfg, b24_opt_expand); | |
bool all = !b24_has_option(cfg, b24_opt_one_path) && | |
!expand; | |
if (all) { | |
*dst++ = syn.bra; | |
dst = hexstr_u16(dst, (uint16_t)src.seq, zeros); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = decstr_u5(dst, idx, align); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = hexstr_u16(dst, src.map, zeros); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
dst = decstr_u5(dst, src.len, align); | |
*dst++ = syn.sep; | |
*dst = syn.spc; | |
dst+= !!syn.spc; | |
} | |
if (b24_has_option(cfg, b24_opt_big_hex)) { | |
dst = hexstr_u64(dst, src.cyc, zeros); | |
} else { | |
*dst++ = syn.bra; | |
dst = b24_has_option(cfg, b24_opt_hex_path) | |
? hexstr_16x4(dst, src.cyc, src.len, syn) | |
: decstr_16x4(dst, src.cyc, src.len, syn, align, | |
b24_has_option(cfg, b24_opt_sig_path)); | |
*dst++ = syn.ket; | |
} | |
if (all) | |
*dst++ = syn.ket; | |
*dst = '\0'; | |
return dst; | |
} | |
static struct trace | |
b24_expand (struct trace tc) | |
{ | |
tc.cyc = seq_expand((uint16_t)tc.seq); | |
tc.len = 16U; | |
tc.map = 0xffffU; | |
return tc; | |
} | |
static void | |
b24_cfg_trace (struct b24_cfg const *cfg) | |
{ | |
struct b24_syntax syn = (struct b24_syntax[]){ | |
{'{', '}', ',', ' '}, | |
{'[', ']', ',', ' '}, // json or python | |
{'{', '}', ',', 0x0}, // no space | |
{'[', ']', ',', 0x0}, // js/py, no space | |
}[b24_has_option(cfg, b24_opt_json) | | |
b24_has_option(cfg, b24_opt_python) | | |
(b24_has_option(cfg, b24_opt_no_space) << 1U)]; | |
bool expand = b24_has_option(cfg, b24_opt_expand); | |
bool once = b24_has_option(cfg, b24_opt_one_path); | |
bool all = !once && !expand; | |
bool please = false; | |
for (unsigned i = 0; i < cfg->n_seq;) { | |
struct trace tc = b24_trace_init(cfg, i); | |
unsigned cycles = 0; | |
for (;; ++cycles) { | |
struct trace t; | |
if (once) { | |
t = b24_trace_from(tc, cfg->offset); | |
} else if (expand) { | |
t = b24_expand(tc); | |
} else { | |
t = b24_trace_next(tc); | |
if (!t.map) | |
break; | |
tc.cyc |= t.cyc << (tc.len << 2U); | |
tc.len += t.len; | |
tc.map |= t.map; | |
} | |
char buf[160]; | |
char *p = b24_cfg_trace_print(cfg, buf, t, | |
cycles, syn); | |
if (p) { | |
if (please) { | |
(void)putchar(syn.sep); | |
(void)putchar('\n'); | |
} | |
(void)fputs(buf, stdout); | |
please = true; | |
} | |
if (!all) | |
break; | |
} | |
if (++i == cfg->n_seq && please) | |
(void)putchar('\n'); | |
} | |
} | |
static void | |
b24_help (void) | |
{ | |
(void)printf( | |
"Usage: b24 [OPTIONS]... [<0..65535>]...\n" | |
"Options:\n" | |
" -%c Print this help text and exit\n" | |
" -%c <state> Trace a neuron's state evolution\n" | |
" -%c <0..15> Trace a single path at offset\n" | |
" -%c Expand subsequences in place\n" | |
"\n" | |
"Input options:\n" | |
" -%c <0..15> Add distinct B(2, 4) as input\n" | |
" -%c <0..15> Apply rotation before tracing\n" | |
"\n" | |
"Output options:\n" | |
" -%c Only cycles, no entry paths\n" | |
" -%c Always find all entry paths\n" | |
"\n" | |
"Formatting options:\n" | |
" -%c Output in Graphviz format\n" | |
" -%c Output in JSON format\n" | |
" -%c Output in Python format\n" | |
" -%c Signed decimal path steps\n" | |
" -%c Hexadecimal path steps\n" | |
" -%c Print every sequence as one big hex number\n" | |
" -%c Don't align output columns\n" | |
" -%c Don't zero-pad hexadecimals\n" | |
" -%c Omit non-separator whitespace\n", | |
b24_opt_char(help), b24_opt_char(neuron), | |
b24_opt_char(one_path), b24_opt_char(expand), | |
b24_opt_char(debruijn), b24_opt_char(rotation), | |
b24_opt_char(cycles), b24_opt_char(entries), | |
b24_opt_char(graphviz), b24_opt_char(json), | |
b24_opt_char(python), b24_opt_char(sig_path), | |
b24_opt_char(hex_path), b24_opt_char(big_hex), | |
b24_opt_char(no_align), b24_opt_char(no_zeros), | |
b24_opt_char(no_space) | |
); | |
} | |
struct str7 { | |
union { | |
char d[8U]; | |
uint64_t u; | |
}; | |
}; | |
const_inline | |
static struct str7 | |
b24_opt_char_str (char ch) | |
{ | |
struct str7 str = {{{'"', '-', ch}}}; | |
str.d[3U - !ch] = '"'; | |
return str; | |
} | |
static void | |
b24_cfg_destroy (struct b24_cfg **cfg) | |
{ | |
if (cfg && *cfg) { | |
free(*cfg); | |
*cfg = NULL; | |
} | |
} | |
_Noreturn static void | |
b24_helpful_exit (struct b24_cfg **cfg, | |
int ret) | |
{ | |
b24_cfg_destroy(cfg); | |
b24_help(); | |
exit(ret); | |
} | |
static struct b24_cfg * | |
b24_cfg_create (size_t n_seq) | |
{ | |
struct b24_cfg *cfg = calloc(1, offsetof(struct b24_cfg, seq) | |
+ n_seq * sizeof cfg->seq[0]); | |
if (!cfg) | |
perror("calloc"); | |
return cfg; | |
} | |
_Noreturn static void | |
b24_cosmic_ray (struct b24_cfg **cfg, | |
char const *what, | |
char const *func, | |
int line) | |
{ | |
size_t n = what ? strlen(what) : 0; | |
if (!n) | |
what = ""; | |
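	/* Index into the string literal below: skip the leading ".\n"
	 * entirely when `what` is empty, skip just the '.' when `what`
	 * already ends in one, otherwise append ".\n" and the rest. */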
(void)fprintf(stderr, "[\033[31mERROR\033[m] %s:%d: %s%s\n", | |
func, line, what, &".\nFailure caused by the impact " | |
"of a high-energy cosmic particle or a high-density " | |
"program author."[n ? what[n-1U] == '.' : 2U]); | |
b24_cfg_destroy(cfg); | |
exit(EXIT_FAILURE); | |
} | |
static int | |
b24_options_incompatible (struct b24_cfg *cfg, | |
enum b24_opt opt, | |
b24_cfg_flags bad, | |
const char sep) | |
{ | |
if (!b24_has_option(cfg, opt)) | |
return 0; | |
bad &= cfg->have_opt; | |
if (!bad) | |
return 0; | |
cfg->have_opt &= ~((typeof(cfg->have_opt))1 << opt); | |
int n = popcnt(bad); | |
if (n >= (int)array_size(b24_opt_to_char)) { | |
b24_cosmic_ray(&cfg, "Option flag outside expected range", | |
__func__, __LINE__); | |
} | |
char buf[64] = ""; | |
char *dst = &buf[0]; | |
b24_cfg_flags bit = 1U; | |
for (int prev = 0, i = 0; i < n;) { | |
int o = __builtin_ctzl(bad); | |
if (o >= (int)array_size(b24_opt_to_char)) | |
break; | |
bit <<= o - prev; | |
bad &= ~bit; | |
prev = o; | |
if (++i > 1) { | |
if (n > 2) | |
*dst++ = sep; | |
*dst++ = ' '; | |
if (i == n) { | |
*dst++ = 'o'; | |
*dst++ = 'r'; | |
*dst++ = ' '; | |
} | |
} | |
struct str7 s = b24_opt_char_str(b24_opt_to_char[o]); | |
for (char *src = s.d; *src; ++src) { *dst++ = *src; } | |
} | |
*dst = '\0'; | |
(void)fprintf(stderr, "Option %s cannot be combined with %s\n", | |
b24_opt_char_str(b24_opt_to_char[opt]).d, buf); | |
return 1; | |
} | |
static void | |
b24_option_assert_once (struct b24_cfg *cfg, | |
enum b24_opt opt) | |
{ | |
if (b24_has_option(cfg, opt)) { | |
(void)fprintf(stderr, "Option %s specified twice\n", | |
b24_opt_char_str(b24_opt_to_char[opt]).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
} | |
static inline uint64_t | |
b24_get_u64_arg (char *arg, | |
int *err) | |
{ | |
if (!arg) { | |
*err = EFAULT; | |
return 0; | |
} | |
if (!*arg) { | |
*err = EINVAL; | |
return 0; | |
} | |
errno = 0; | |
uint64_t u64 = 0U; | |
char *endptr = arg; | |
int64_t i64 = _Generic(i64, long: strtol, | |
long long: strtoll)(arg, &endptr, 0); | |
int e = errno; | |
if (!e) { | |
u64 = (uint64_t)i64; | |
if (*endptr) | |
e = EINVAL; | |
} else if (e == ERANGE && i64 == _Generic(i64, long: LONG_MAX, | |
long long: LLONG_MAX)) { | |
errno = 0; | |
endptr = arg; | |
u64 = _Generic(u64, unsigned long: strtoul, | |
unsigned long long: strtoull)(arg, &endptr, 0); | |
e = errno; | |
if (!e && *endptr) | |
e = EINVAL; | |
} | |
*err = e; | |
return u64; | |
} | |
static uint64_t | |
b24_get_int_arg (char *arg, | |
int *err, | |
uint64_t max) | |
{ | |
int e = 0; | |
uint64_t u64 = b24_get_u64_arg(arg, &e); | |
if (u64 > max) { | |
u64 = max; | |
if (!e) | |
e = ERANGE; | |
} | |
*err = e; | |
return u64; | |
} | |
static struct b24_cfg * | |
b24_parse_args (int argc, | |
char **argv) | |
{ | |
struct b24_cfg *cfg = b24_cfg_create(argc > 1 ? argc - 1 : 0); | |
if (!cfg) | |
return NULL; | |
enum b24_opt expect = 0; | |
char c_ = '\0'; | |
for (int i = 1; i < argc; i++) { | |
char *a = argv[i]; | |
if (expect) { | |
if (!*a) | |
goto missing_arg; | |
parse_arg: do{}while(0); | |
pragma_msvc(warning(push)) | |
pragma_msvc(warning(disable: 4061)) | |
uint64_t max = 15U; | |
switch (expect) { | |
case b24_opt_neuron: | |
max = UINT64_MAX; | |
break; | |
case b24_opt_debruijn: | |
case b24_opt_one_path: | |
case b24_opt_rotation: | |
break; | |
default: | |
b24_cosmic_ray(&cfg, "", __func__, __LINE__); | |
} | |
int e = 0; | |
uint64_t n = b24_get_int_arg(a, &e, max); | |
if (e) { | |
(void)fprintf(stderr, "Option %s " | |
"parse error: %s\n", | |
b24_opt_char_str(c_).d, | |
strerror(e)); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
switch (expect) { | |
case b24_opt_debruijn: | |
cfg->seq[cfg->n_seq++] = b24[n]; | |
break; | |
case b24_opt_neuron: | |
cfg->neuron = n; | |
break; | |
case b24_opt_one_path: | |
cfg->offset = (b24_cfg_offset)n; | |
break; | |
case b24_opt_rotation: | |
cfg->rotation = (uint16_t)n; | |
default: | |
break; | |
} | |
pragma_msvc(warning(pop)) | |
b24_option_add(cfg, expect); | |
expect = 0; | |
continue; | |
} | |
if (*a == '-') { | |
c_ = *++a; | |
concatenated_option: | |
unsigned ch_opt = b24_char_to_opt[(unsigned char)c_]; | |
if (!ch_opt) { | |
(void)fprintf(stderr, "Option %s is unknown" | |
"\n", b24_opt_char_str(c_).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
enum b24_opt opt = ch_opt & ~(NEEDS_ARG | ONLY_ONCE); | |
if (ch_opt & ONLY_ONCE) | |
b24_option_assert_once(cfg, opt); | |
if (ch_opt & NEEDS_ARG) { | |
expect = opt; | |
if (*++a) | |
goto parse_arg; | |
} else { | |
b24_option_add(cfg, opt); | |
if (c_ && (c_ = *++a)) | |
goto concatenated_option; | |
} | |
continue; | |
} | |
int e = 0; | |
uint64_t n = b24_get_int_arg(a, &e, UINT16_MAX); | |
if (e) { | |
(void)fprintf(stderr, "Bad sequence: %s\n", | |
strerror(e)); | |
b24_cfg_destroy(&cfg); | |
return NULL; | |
} | |
cfg->seq[cfg->n_seq++] = (uint16_t)n; | |
} | |
int yikes = b24_options_incompatible(cfg, b24_opt_big_hex , | |
1U << b24_opt_hex_path | | |
1U << b24_opt_sig_path , ',') | |
+ b24_options_incompatible(cfg, b24_opt_cycles , | |
1U << b24_opt_entries | | |
1U << b24_opt_neuron | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_entries , | |
1U << b24_opt_cycles | | |
1U << b24_opt_neuron | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_graphviz , | |
1U << b24_opt_json | | |
1U << b24_opt_python , ',') | |
+ b24_options_incompatible(cfg, b24_opt_json , | |
1U << b24_opt_python | | |
1U << b24_opt_graphviz , ',') | |
+ b24_options_incompatible(cfg, b24_opt_neuron , | |
1U << b24_opt_one_path | | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_one_path , | |
1U << b24_opt_expand , ',') | |
+ b24_options_incompatible(cfg, b24_opt_python , | |
1U << b24_opt_json | | |
1U << b24_opt_graphviz , ',') | |
+ b24_options_incompatible(cfg, b24_opt_expand , | |
1U << b24_opt_cycles | | |
1U << b24_opt_entries | | |
1U << b24_opt_one_path , ',') | |
+ b24_options_incompatible(cfg, b24_opt_sig_path , | |
1U << b24_opt_hex_path , ','); | |
if (yikes) | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
if (expect) | |
goto missing_arg; | |
if (b24_has_option(cfg, b24_opt_help)) | |
b24_helpful_exit(&cfg, EXIT_SUCCESS); | |
if (!cfg->n_seq) { | |
(void)fputs("No sequence(s) specified\n", stderr); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
return cfg; | |
missing_arg: | |
(void)fprintf(stderr, "Option %s expects an argument\n", | |
b24_opt_char_str(c_).d); | |
b24_helpful_exit(&cfg, EXIT_FAILURE); | |
} | |
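/* Emit one Graphviz record node for a 16-bit sequence, e.g.
 * `S3 [label="<f>1|<e>0|...|<0>1"];`, with a named port per bit.
 * (No caller in this file yet.)
 */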
static char * | |
b24_render_gv_struct (char *dst, | |
size_t siz, | |
uint16_t seq, | |
unsigned _BitInt(4) id) | |
{ | |
if (siz < | |
sizeof "S0 [label=\"<f>0|<e>0|<d>0|<c>0|" | |
"<b>0|<a>0|<9>0|<8>0|" | |
"<7>0|<6>0|<5>0|<4>0|" | |
"<3>0|<2>0|<1>0|<0>0\"];\n") | |
return NULL; | |
memcpy(dst, "S0 [label=\"", sizeof "S0 [label=\"" - 1U); | |
*++dst = hexdig(id); | |
dst += sizeof " [label=\""; | |
for (unsigned off = 15U;; --off) { | |
*dst++ = '<'; | |
*dst++ = hexdig(off); | |
*dst++ = '>'; | |
*dst++ = (char)((unsigned char)'0' + (seq >> off & 1U)); | |
if (!off) | |
break; | |
*dst++ = '|'; | |
} | |
*dst++ = '"'; | |
*dst++ = ']'; | |
*dst++ = ';'; | |
*dst++ = '\n'; | |
return dst; | |
} | |
int | |
main (int c, | |
char **v) | |
{ | |
struct b24_cfg *cfg = b24_parse_args(c, v); | |
if (!cfg) | |
return EXIT_FAILURE; | |
b24_cfg_trace(cfg); | |
b24_cfg_destroy(&cfg); | |
return EXIT_SUCCESS; | |
} | |
pragma_msvc(warning(pop)) |
/** @file clock.h | |
*/ | |
#ifndef B24_CLOCK_H_ | |
#define B24_CLOCK_H_ | |
#ifndef _MSC_VER | |
# include <time.h> | |
#endif | |
#include "vec.h" | |
static force_inline vec128 | |
unix_epoch (void) | |
{ | |
#ifdef _MSC_VER | |
	FILETIME ft = {0};
	GetSystemTimeAsFileTime(&ft);
	uint64_t sec = ((ULARGE_INTEGER){
		.LowPart = ft.dwLowDateTime,
		.HighPart = ft.dwHighDateTime,
	}).QuadPart / UINT64_C(10000000) -
	((369U * 365U + 89U) * UINT64_C(86400));
	/* The return type is vec128, so the scalar seconds value cannot be
	 * returned directly; mirror it into both 64-bit lanes (the POSIX
	 * branch below fills the lanes with clock readings instead). */
	return (vec128){ .u64 = { sec, sec } };
#else | |
struct timespec ts = {0}, ts2 = {0}; | |
(void)clock_gettime(CLOCK_REALTIME, &ts); | |
(void)clock_gettime(CLOCK_MONOTONIC, &ts2); | |
vec128 epoch = { | |
.u32[0] = ts.tv_sec, | |
.u32[2] = ts2.tv_sec, | |
.u32[1] = ts2.tv_sec, | |
.u32[3] = ts.tv_sec | |
}; | |
return epoch; | |
#endif | |
} | |
static force_inline void hol_up (uint64_t ns) | |
{ | |
#ifdef _MSC_VER | |
LARGE_INTEGER t = { | |
.QuadPart = (int64_t)(ns / 100U) * -1 | |
}; | |
(void)NtDelayExecution(FALSE, &t); | |
#else | |
struct timespec t = { | |
.tv_sec = (typeof(t.tv_sec))(ns / UINT64_C(1000000000)), | |
.tv_nsec = (typeof(t.tv_nsec))(ns % UINT64_C(1000000000)) | |
}; | |
(void)clock_nanosleep(CLOCK_MONOTONIC, 0, &t, NULL); | |
#endif | |
} | |
#endif /* B24_CLOCK_H_ */ |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include "cortex.h" | |
#include "seq.h" | |
struct cortex * | |
cortex_create (void) | |
{ | |
struct cortex *ctx = calloc(1, sizeof *ctx); | |
if (!ctx) { | |
perror("calloc"); | |
return NULL; | |
} | |
cortex_init(ctx); | |
return ctx; | |
} | |
void | |
cortex_destroy (struct cortex **ctx) | |
{ | |
if (ctx) { | |
struct cortex *c = *ctx; | |
*ctx = NULL; | |
ctx = NULL; | |
free(c); | |
c = NULL; | |
} | |
} | |
void | |
cortex_init (struct cortex *ctx) | |
{ | |
const uint16_t de_bruijn_seq_2_4[] = { | |
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU, | |
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U, | |
0xcbdU, 0xd2fU, 0xd79U, 0xde5U, | |
0xf2dU, 0xf4bU, 0xf59U, 0xf65U, | |
}; | |
for (uint32_t i = 0, y = 0; y < 16U; ++y) { | |
uint32_t seq = de_bruijn_seq_2_4[y]; | |
seq |= seq << 16U; | |
for (uint32_t x = i + 16U; i < x; ++i) { | |
ctx->paths[fwd(32, 8, i)] = | |
seq_expand_vec128((uint16_t)seq); | |
seq >>= 1U; | |
} | |
} | |
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) { | |
layer_init(&ctx->layer[i], (struct layer_cfg){ | |
.self = i, | |
.prev = (i + array_size(ctx->layer) | |
- 1U) % array_size(ctx->layer), | |
.next = (i + 1U) % array_size(ctx->layer), | |
.mode = i, | |
}, &ctx->data[i], &ctx->data[i + 1U]); | |
} | |
} |
/** @file cortex.h | |
*/ | |
#ifndef B24_CORTEX_H_ | |
#define B24_CORTEX_H_ | |
#include "clock.h" | |
#include "layer.h" | |
#include "neuron.h" | |
#ifdef __aarch64__ | |
# include "zp7.h" | |
#endif | |
enum ctx_flag { | |
CORTEX_TOCK = 1U | |
}; | |
enum { DIR_IN, DIR_OUT }; | |
/** | |
* @brief Cortex structure. | |
* | |
* This is the structure that holds all the neurons. | |
* | |
* More like a resonant bucket than a brain. | |
*/ | |
struct cortex { | |
uint64_t resvd[7]; // align to cache line | |
uint64_t flags; | |
vec128 paths[256]; | |
struct layer layer[16]; | |
struct chunk data[17]; | |
}; | |
union ctx_addr { | |
struct { | |
uint8_t b8 : 3; | |
uint8_t u8 : 4; | |
uint8_t vec : 1; | |
uint8_t : 8; | |
}; | |
struct { | |
uint8_t b64 : 6; | |
uint8_t u64 : 1; | |
uint8_t : 1; | |
uint8_t rot : 4; | |
uint8_t seq : 4; | |
}; | |
struct { | |
uint8_t : 3; | |
uint8_t chn : 4; | |
uint8_t dir : 1; | |
uint8_t nrn : 8; | |
}; | |
uint16_t raw; | |
}; | |
_Static_assert(sizeof(union ctx_addr) == 2U, ""); | |
static pure_inline union ctx_addr | |
ctx_addr (uint16_t raw) | |
{ | |
return (union ctx_addr){.raw = raw}; | |
} | |
extern struct cortex * | |
cortex_create (void); | |
extern void | |
cortex_destroy (struct cortex **ctx); | |
extern void | |
cortex_init (struct cortex *ctx); | |
struct cortex_loc { | |
uint16_t bit : 2; | |
uint16_t chn : 4; | |
uint16_t rot : 4; | |
uint16_t seq : 4; | |
uint16_t pad : 2; | |
}; | |
struct linear_loc { | |
uint16_t col : 7; | |
uint16_t row : 7; | |
uint16_t pad : 2; | |
}; | |
static const_inline struct cortex_loc | |
linear_loc_fwd (struct linear_loc loc) | |
{ | |
union { | |
struct cortex_loc ctx; | |
struct linear_loc lin; | |
uint16_t raw; | |
} src = { | |
.lin = loc | |
}, dst = { | |
.raw = (uint16_t)fwd(32, 14, src.raw) | |
}; | |
return dst.ctx; | |
} | |
static force_inline void | |
cortex_tick (struct cortex *ctx) | |
{ | |
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) { | |
layer_tick(&ctx->layer[i], &ctx->data[(i + 1U) & 15U]); | |
layer_tock(&ctx->layer[i], &ctx->data[i ]); | |
} | |
} | |
#endif /* B24_CORTEX_H_ */ |
#include <stdio.h> | |
#include "layer_priv.h" | |
void | |
layer_init (struct layer *obj, | |
struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out) | |
{ | |
*obj = layer(cfg, in, out); | |
} | |
void | |
layer_fini (struct layer *obj) | |
{ | |
(void)obj; | |
} | |
void | |
layer_run (struct layer *obj, | |
struct chunk *input) | |
{ | |
bool init_now = !obj->cfg.init; | |
unsigned hole = 0; | |
for (uint16_t i = 0; i < (uint16_t)array_size(obj->data[0]->v); ++i) { | |
// internal connections | |
vec128 self_vec = *layer_vector(obj, (union ctx_addr){ | |
.dir = DIR_IN, | |
.nrn = i, | |
}); | |
vec128 peer_vec[16] = {0}; | |
union ctx_addr peer_addr[16] = {0}; | |
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) { | |
uint16_t tmp = layer_map(obj, (union ctx_addr){ | |
.chn = j, | |
.nrn = i, | |
}); | |
peer_addr[j] = (union ctx_addr){ | |
.chn = tmp & 15U, | |
.dir = DIR_IN, | |
.nrn = tmp >> 4U, | |
}; | |
peer_vec[j] = *layer_vector(obj, peer_addr[j]); | |
} | |
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) { | |
if (peer_addr[j].nrn == i) { | |
if (init_now) | |
++hole; | |
// self connection (external input) | |
// TOEDOE: run callback and assign result to self.u8[j] | |
//self_vec.u8[j] = input->v[i].u8[j]; | |
continue; | |
} | |
uint8_t ch = self_vec.u8[j]; | |
uint8_t up = layer_channel_whence( | |
obj, | |
(union ctx_addr) { | |
.chn = j, | |
.nrn = i, | |
} | |
); | |
bool match_lsb = (ch & 15U) == (up & 15U); | |
bool match_msb = (ch >> 4U) == j; | |
if ((match_msb ^ match_lsb)) { | |
peer_vec[j].u8[j] = self_vec.u8[j]; | |
self_vec.u8[j] = 0; | |
} else if (match_msb) { | |
self_vec.u8[j] = 0; | |
} else { | |
self_vec.u8[j] = peer_vec[j].u8[j]; | |
//peer_vec[j].u8[j] = 0; | |
} | |
layer_save_vector(obj, (union ctx_addr){ | |
.dir = DIR_OUT, | |
.nrn = peer_addr[j].nrn, | |
}, peer_vec[j]); | |
} | |
struct neuron nrn = layer_make_neuron( | |
obj, | |
(union ctx_addr){ | |
.dir = DIR_IN, | |
.nrn = i, | |
} | |
); | |
layer_save_vector( | |
obj, | |
(union ctx_addr) { | |
.dir = DIR_OUT, | |
.nrn = i, | |
}, | |
neuron_send(&nrn, self_vec) | |
); | |
} | |
if (init_now) { | |
obj->cfg.init = 1; | |
obj->cfg.hole = hole; | |
(void)printf("inputs: %u\n", hole); | |
} | |
} |
/** @file layer.h | |
*/ | |
#ifndef B24_LAYER_H_ | |
#define B24_LAYER_H_ | |
#include "vec.h" | |
struct layer_cfg { | |
uint64_t self : 4; | |
uint64_t prev : 4; | |
uint64_t next : 4; | |
uint64_t mode : 4; | |
uint64_t hole : 8; | |
uint64_t init : 1; | |
uint64_t tock : 1; | |
uint64_t resv : 36; | |
}; | |
_Static_assert(sizeof(struct layer_cfg) | |
== sizeof(uint64_t), ""); | |
/** | |
* @brief Network layer. | |
*/ | |
struct layer { | |
struct layer_cfg cfg; | |
struct chunk *data[2]; | |
}; | |
_Static_assert(sizeof(struct layer) | |
== sizeof(struct layer_cfg) | |
+ sizeof(struct chunk *) * 2U, ""); | |
extern void | |
layer_init (struct layer *obj, | |
struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out); | |
extern void | |
layer_fini (struct layer *obj); | |
extern void | |
layer_run (struct layer *obj, | |
struct chunk *in); | |
static force_inline void | |
layer_tick (struct layer *obj, | |
struct chunk *in) | |
{ | |
obj->cfg.tock = 0; | |
layer_run(obj, in); | |
} | |
static force_inline void | |
layer_tock (struct layer *obj, | |
struct chunk *in) | |
{ | |
obj->cfg.tock = 1; | |
layer_run(obj, in); | |
} | |
#endif /* B24_LAYER_H_ */ |
/** @file layer_priv.h | |
*/ | |
#ifndef B24_LAYER_PRIV_H_ | |
#define B24_LAYER_PRIV_H_ | |
#include <stdio.h> | |
#include "cortex.h" | |
#include "layer.h" | |
static pure_inline struct cortex * | |
to_cortex (struct layer const *obj) | |
{ | |
return container_of(obj, struct cortex, layer[obj->cfg.self]); | |
} | |
static pure_inline struct vec128 * | |
layer_vector (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return &obj->data[addr.dir ^ obj->cfg.tock]->v[addr.nrn]; | |
} | |
static pure_inline struct neuron | |
layer_make_neuron (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return neuron_from_vec128( | |
&to_cortex(obj)->paths[addr.nrn], | |
layer_vector(obj, addr) | |
); | |
} | |
static pure_inline uint8_t | |
layer_channel_whence (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return to_cortex(obj)->paths[addr.nrn].u8[addr.chn]; | |
} | |
static pure_inline uint8_t | |
layer_channel_get (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
return layer_vector(obj, addr)->u8[addr.chn]; | |
} | |
static force_inline void | |
layer_channel_set (struct layer *obj, | |
union ctx_addr addr, | |
uint8_t data) | |
{ | |
layer_vector(obj, addr)->u8[addr.chn] = data; | |
} | |
static pure_inline uint8_t | |
layer_channel_swap (struct layer *obj, | |
union ctx_addr addr, | |
uint8_t data) | |
{ | |
uint8_t ret = layer_channel_get(obj, addr); | |
addr.dir ^= 1U; | |
layer_channel_set(obj, addr, data); | |
return ret; | |
} | |
static force_inline struct layer | |
layer (struct layer_cfg cfg, | |
struct chunk *in, | |
struct chunk *out) | |
{ | |
return (struct layer) { | |
.cfg = cfg, | |
.data = {in, out} | |
}; | |
} | |
static force_inline void | |
layer_save_vector (struct layer *obj, | |
union ctx_addr addr, | |
vec128 data) | |
{ | |
*layer_vector(obj, addr) = data; | |
} | |
static pure_inline uint16_t | |
layer_map (struct layer const *obj, | |
union ctx_addr addr) | |
{ | |
struct cortex *ctx = to_cortex(obj); | |
const uint16_t index[16] = { | |
// cortex::paths is shuffled with rev(32, 8), | |
// the original indices are the EOL comments. | |
83, 36, 100, 99, 110, 129, 213, 200, // 29, 66, 74, 89, 122, 129, 143, 168, | |
204, 160, 224, 240, 167, 249, 253, 170, // 170, 192, 200, 204, 211, 237, 239, 240, | |
}; | |
uint16_t s = index[addr.chn]; | |
uint16_t r = index[obj->cfg.mode]; | |
uint16_t them = (uint16_t)ctx->paths[s].u8[addr.seq] << 8U | |
| (uint16_t)ctx->paths[r].u8[addr.rot] << 4U | |
| (uint16_t)ctx->paths[r].u8[addr.chn]; | |
return (uint16_t)rev(32, 12, them); | |
} | |
#endif /* B24_LAYER_PRIV_H_ */ |
/** | |
* @file neuron.h | |
*/ | |
#ifndef B24_NEURON_H_ | |
#define B24_NEURON_H_ | |
#include <limits.h> | |
#include "seq.h" | |
#include "vec.h" | |
/** | |
* @brief Runtime neuron structure. | |
* | |
* This is what is actually used with the SIMD instructions. | |
*/ | |
struct neuron { | |
u8x16 paths; | |
u8x16 state; | |
}; | |
static pure_inline struct neuron | |
neuron_from_vec128 (vec128 *paths, | |
vec128 *state) | |
{ | |
return (struct neuron){ | |
.paths = u8x16_from_vec128(*paths), | |
.state = u8x16_from_vec128(*state) | |
}; | |
} | |
static pure_inline struct neuron | |
neuron (uint16_t seq, | |
vec128 state) | |
{ | |
return (struct neuron){ | |
.paths = seq_expand_u8x16(seq), | |
.state = u8x16_from_vec128(state) | |
}; | |
} | |
static force_inline u8x16 | |
neuron_tick (struct neuron *nrn, | |
u8x16 in) | |
{ | |
u8x16 state = nrn->state; | |
nrn->state = age_and_mutate( | |
merge_input( | |
state, | |
in | |
), | |
nrn->paths | |
); | |
return state; | |
} | |
static force_inline vec128 | |
neuron_tock (struct neuron *nrn, | |
vec128 in) | |
{ | |
nrn->state = age_and_mutate( | |
merge_input( | |
nrn->state, | |
u8x16_from_vec128(in) | |
), | |
nrn->paths | |
); | |
return vec128_from_u8x16(nrn->state); | |
} | |
static force_inline vec128 | |
neuron_send (struct neuron *nrn, | |
vec128 in) | |
{ | |
nrn->state = mutate( | |
select_nonzero_input( | |
nrn->state, | |
u8x16_from_vec128(in) | |
), | |
nrn->paths | |
); | |
return vec128_from_u8x16(nrn->state); | |
} | |
#endif /* B24_NEURON_H_ */ |
/** @file popcnt.h | |
*/ | |
#ifndef B24_POPCNT_H_ | |
#define B24_POPCNT_H_ | |
#include <limits.h> | |
#include <stdint.h> | |
#include "util.h" | |
#undef define_popcnt | |
#undef popcnt16_impl | |
#undef popcnt32_impl | |
#undef popcnt64_impl | |
#undef popcnt_compat | |
#ifdef _MSC_VER | |
# define popcnt16_impl return __popcnt16 | |
# pragma intrinsic(__popcnt16) | |
# define popcnt32_impl return __popcnt | |
# pragma intrinsic(__popcnt) | |
# ifdef _M_IX86 | |
# undef popcnt32x2 | |
# define popcnt64_impl return popcnt32x2 | |
# define popcnt32x2(x) \ | |
(__popcnt((uint32_t)(x >> 32U)) + \ | |
__popcnt((uint32_t)(x & ~(uint32_t)0))) | |
# else | |
# define popcnt64_impl return __popcnt64 | |
# pragma intrinsic(__popcnt64) | |
# endif | |
# define popcnt_pre_pragma _Pragma("warning(push)") \ | |
_Pragma("warning(disable: 4116)") | |
# define popcnt_post_pragma _Pragma("warning(pop)") | |
#else | |
# if __has_builtin(__builtin_popcount) && (UINT_MAX == UINT32_MAX) | |
# define popcnt16_impl return (uint16_t)__builtin_popcount | |
# define popcnt32_impl return (uint32_t)__builtin_popcount | |
# endif | |
# if !defined(popcnt32_impl) && \ | |
__has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT32_MAX) | |
# define popcnt16_impl return (uint16_t)__builtin_popcountl | |
# define popcnt32_impl return (uint32_t)__builtin_popcountl | |
# endif | |
# if __has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT64_MAX) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountl | |
# endif | |
# if !defined(popcnt64_impl) && \ | |
__has_builtin(__builtin_popcountll) && (ULLONG_MAX == UINT64_MAX) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountll | |
# endif | |
# if __has_builtin(__builtin_popcountg) | |
# if !defined(popcnt16_impl) | |
# define popcnt16_impl return (uint16_t)__builtin_popcountg | |
# endif | |
# if !defined(popcnt32_impl) | |
# define popcnt32_impl return (uint32_t)__builtin_popcountg | |
# endif | |
# if !defined(popcnt64_impl) | |
# define popcnt64_impl return (uint64_t)__builtin_popcountg | |
# endif | |
# endif | |
#endif | |
#define define_popcnt(b, ...) const_inline \ | |
static uint##b##_t popcnt##b(uint##b##_t val) \ | |
{ __VA_ARGS__(val); } _Static_assert(b>0, #b) | |
#define popcnt_compat(x) typeof(_Generic( \ | |
(char(*)[2 - (sizeof(x) > 4U)])0 \ | |
,char(*)[1]: (uint64_t)0 \ | |
,char(*)[2]: (uint32_t)0)) y = (x); \ | |
y -= (typeof(y))-1/3 & (y >> 1U); \ | |
y = ((typeof(y))-1/5 & (y >> 2U)) \ | |
+ ((typeof(y))-1/5 & y); \ | |
return ((typeof(y))-1/ 17 & ((y >> 4U) + y)) \ | |
* ((typeof(y))-1/255) >> (sizeof y - 1U) \ | |
* CHAR_BIT | |
#ifndef popcnt16_impl | |
# define popcnt16_impl popcnt_compat | |
#endif | |
#ifndef popcnt32_impl | |
# define popcnt32_impl popcnt_compat | |
#endif | |
#ifndef popcnt64_impl | |
# define popcnt64_impl popcnt_compat | |
#endif | |
#ifndef popcnt_pre_pragma | |
# define popcnt_pre_pragma | |
#endif | |
#ifndef popcnt_post_pragma | |
# define popcnt_post_pragma | |
#endif | |
define_popcnt(16, popcnt16_impl); | |
define_popcnt(32, popcnt32_impl); | |
define_popcnt(64, popcnt64_impl); | |
#undef define_popcnt | |
#undef popcnt16_impl | |
#undef popcnt32_impl | |
#undef popcnt64_impl | |
#undef popcnt_compat | |
#if defined(_MSC_VER) && defined(_M_IX86) | |
# undef popcnt32x2 | |
#endif | |
#define popcnt(x) popcnt_pre_pragma _Generic((x), \ | |
typeof(_Generic((char)0, \ | |
signed char: (struct{int i;}){0}, \ | |
unsigned char: (struct{int i;}){0}, \ | |
default: (char)0)): popcnt16, \ | |
signed char: popcnt16, short: popcnt16, \ | |
unsigned char: popcnt16, int: popcnt32, \ | |
unsigned short: popcnt16, unsigned: popcnt32, \ | |
long long: popcnt64, unsigned long: _Generic( \ | |
&(int[sizeof 1UL]){0}, \ | |
int(*)[sizeof(uint32_t)]: popcnt32, \ | |
int(*)[sizeof(uint64_t)]: popcnt64), \ | |
unsigned long long: popcnt64, long: _Generic( \ | |
&(int[sizeof 1L]){0}, \ | |
int(*)[sizeof(int32_t)]: popcnt32, \ | |
int(*)[sizeof(int64_t)]: popcnt64)) \ | |
(_Generic((x), default: (x), \ | |
long long: (unsigned long long)(x), \ | |
signed char: (unsigned char)(x), \ | |
short: (unsigned short)(x), \ | |
typeof(_Generic((char)0, \ | |
signed char: \ | |
(struct{int i;}){0}, \ | |
unsigned char: \ | |
(struct{int i;}){0}, \ | |
default: (char)0)): \ | |
(unsigned char)(x), \ | |
long: (unsigned long)(x), \ | |
int: (unsigned)(x))) popcnt_post_pragma | |
#endif /* B24_POPCNT_H_ */ |
/** @file seq.h | |
*/ | |
#ifndef B24_SEQ_H_ | |
#define B24_SEQ_H_ | |
#include "vec.h" | |
#ifdef __aarch64__ | |
static const_inline u8x16 | |
mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return vqtbl1q_u8(state, paths); | |
} | |
static const_inline uint16_t | |
sum_of_abs_diff (u8x16 a, | |
u8x16 b) | |
{ | |
return vaddvq_u8(vabdq_u8(a, b)); | |
} | |
/** | |
* @brief Age and mutate the state. | |
* | |
* Decrement each non-zero strength by one and apply the given mutation. | |
* | |
* @param state The current state. | |
* @param paths The mutation descriptor. | |
* @return The resulting state. | |
*/ | |
static const_inline u8x16 | |
age_and_mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return vqtbl1q_u8( | |
vsubq_u8( | |
state, | |
vbslq_u8( | |
vtstq_u8( | |
state, | |
vdupq_n_u8(0xf0) | |
), | |
vdupq_n_u8(0x10), | |
state | |
) | |
), | |
paths | |
); | |
} | |
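/* A scalar sketch of the operation above (illustrative only, using the
 * same "strength lives in the high nibble" convention as merge_input):
 *
 *   uint8_t aged[16], out[16];
 *   for (int i = 0; i < 16; ++i)
 *           aged[i] = (state[i] & 0xf0) ? (uint8_t)(state[i] - 0x10) : 0;
 *   for (int i = 0; i < 16; ++i)
 *           out[i] = paths[i] < 16 ? aged[paths[i]] : 0;
 */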
/** | |
* @brief Merge input into the current state, preferring the input. | |
* | |
* For each channel, if the input channel has a non-zero strength, | |
* use that channel in the output, otherwise use the corresponding | |
* channel from the current state. | |
* | |
* @param state The current state. | |
* @param input The input to merge into the state. | |
* @return The merged state. | |
*/ | |
static const_inline u8x16 | |
merge_input (u8x16 state, | |
u8x16 input) | |
{ | |
return vbslq_u8( | |
vtstq_u8( | |
input, | |
vdupq_n_u8(0xf0) | |
), | |
input, | |
state | |
); | |
} | |
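/* Equivalent scalar form (illustrative):
 *
 *   for (int i = 0; i < 16; ++i)
 *           out[i] = (input[i] & 0xf0) ? input[i] : state[i];
 */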
static const_inline u8x16 | |
select_nonzero_input (u8x16 state, | |
u8x16 input) | |
{ | |
return vbslq_u8( | |
vtstq_u8( | |
input, | |
vdupq_n_u8(0xff) | |
), | |
input, | |
state | |
); | |
} | |
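/**
 * @brief Expand a 16-bit sequence into its sixteen 4-bit windows.
 *
 * Nibble i of the result holds bits i..i+3 of @p seq, wrapping around at
 * bit 15, i.e. the B(2, 4) subsequence visible at offset i.
 */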
static force_inline uint64_t | |
seq_expand (uint16_t seq) | |
{ | |
uint16_t rot = seq << 12U | seq >> 4U; | |
uint64_t ret = 0; | |
uint64x2_t x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u16( | |
vld1q_lane_u16( | |
&rot, | |
vld1q_lane_u16( | |
&seq, | |
vdupq_n_u16(0), | |
0 | |
), | |
1 | |
) | |
), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x00, 0x00, 0x00, | |
0x02, 0x02, 0x02, 0x02, | |
0x01, 0x01, 0x01, 0x01, | |
0x03, 0x03, 0x03, 0x03}) | |
) | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x003c000f003c000f)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x78001e0078001e00)) | |
), | |
5 | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x000000ff000000ff)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x03fc000003fc0000)) | |
), | |
10 | |
) | |
); | |
vst1q_lane_u64( | |
&ret, | |
vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u64(x), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x01, 0x04, 0x05, | |
0x08, 0x09, 0x0c, 0x0d, | |
0x80, 0x80, 0x80, 0x80, | |
0x80, 0x80, 0x80, 0x80}) | |
) | |
) | |
), | |
0 | |
); | |
return ret; | |
} | |
static force_inline u8x16 | |
seq_expand_u8x16 (uint16_t seq) | |
{ | |
uint64x2_t x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u16( | |
vld1q_lane_u16( | |
(uint16_t[1]){ | |
seq << 12U | | |
seq >> 4U | |
}, | |
vld1q_lane_u16( | |
&seq, | |
vdupq_n_u16(0), | |
0 | |
), | |
1 | |
) | |
), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x00, 0x00, 0x00, | |
0x02, 0x02, 0x02, 0x02, | |
0x01, 0x01, 0x01, 0x01, | |
0x03, 0x03, 0x03, 0x03}) | |
) | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x003c000f003c000f)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x78001e0078001e00)) | |
), | |
5 | |
) | |
); | |
x = vorrq_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x000000ff000000ff)) | |
), | |
vshrq_n_u64( | |
vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x03fc000003fc0000)) | |
), | |
10 | |
) | |
); | |
x = vreinterpretq_u64_u8( | |
vqtbl1q_u8( | |
vreinterpretq_u8_u64(x), | |
vld1q_u8(((uint8_t[16]){ | |
0x00, 0x80, 0x01, 0x80, | |
0x04, 0x80, 0x05, 0x80, | |
0x08, 0x80, 0x09, 0x80, | |
0x0c, 0x80, 0x0d, 0x80}) | |
) | |
) | |
); | |
uint64x2_t y = vandq_u64( | |
x, | |
vdupq_n_u64(UINT64_C(0x00f000f000f000f0)) | |
); | |
return vreinterpretq_u8_u64( | |
vorrq_u64( | |
veorq_u64(x, y), | |
vshlq_n_u64(y, 4) | |
) | |
); | |
} | |
static force_inline vec128 | |
seq_expand_vec128 (uint16_t seq) | |
{ | |
return vec128_from_u8x16(seq_expand_u8x16(seq)); | |
} | |
#endif /* __aarch64__ */ | |
#if defined(__x86_64__) || defined(_MSC_VER) | |
# define pext_mask 0x783c1e0f783c1e0fULL | |
# define pdep_mask 0x0f0f0f0f0f0f0f0fULL | |
static force_inline uint64_t | |
seq_expand (uint16_t seq) | |
{ | |
u8x16 k = _mm_shuffle_epi8( | |
_mm_set_epi64x( | |
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq | |
), | |
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL) | |
); | |
return _pext_u64((uint64_t)_mm_extract_epi64(k, 1), pext_mask) << 32U | |
| _pext_u64((uint64_t)_mm_extract_epi64(k, 0), pext_mask); | |
} | |
static force_inline vec128 | |
seq_expand_vec128 (uint16_t seq) | |
{ | |
u8x16 k = _mm_shuffle_epi8( | |
_mm_set_epi64x( | |
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq | |
), | |
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL) | |
); | |
return (vec128){ | |
.u64[0] = _pdep_u64( | |
_pext_u64( | |
(unsigned long long)_mm_extract_epi64(k, 0), | |
pext_mask | |
), | |
pdep_mask | |
), | |
.u64[1] = _pdep_u64( | |
_pext_u64( | |
(unsigned long long)_mm_extract_epi64(k, 1), | |
pext_mask | |
), | |
pdep_mask | |
) | |
}; | |
} | |
static force_inline u8x16 | |
seq_expand_u8x16 (uint16_t seq) | |
{ | |
return u8x16_from_vec128(seq_expand_vec128(seq)); | |
} | |
# undef pdep_mask | |
# undef pext_mask | |
static const_inline u8x16 | |
mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return _mm_shuffle_epi8(state, paths); | |
} | |
static const_inline uint16_t | |
sum_of_abs_diff (u8x16 a, | |
u8x16 b) | |
{ | |
__m128i x = _mm_sad_epu8(a, b); | |
return _mm_extract_epi16(x, 0) | |
+ _mm_extract_epi16(x, 4); | |
} | |
static const_inline u8x16 | |
age_and_mutate (u8x16 state, | |
u8x16 paths) | |
{ | |
return _mm_shuffle_epi8( | |
_mm_subs_epu8( | |
state, | |
_mm_set_epi64x( | |
0x1010101010101010LL, | |
0x1010101010101010LL | |
) | |
), | |
paths | |
); | |
} | |
static const_inline u8x16 | |
merge_input (u8x16 state, | |
u8x16 input) | |
{ | |
return _mm_blendv_epi8( | |
state, | |
input, | |
/* Mask construction: adding 0x70 (saturated) | |
* sets the top bit iff the value is at least | |
* 0x10, and the result can be used as a mask | |
* in `_mm_blendv_epi8()`. | |
*/ | |
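		/* e.g. 0x0f + 0x70 == 0x7f (top bit clear, keep state),
		 * 0x10 + 0x70 == 0x80 (top bit set, take input), and
		 * 0x9f + 0x70 saturates to 0xff (take input).
		 */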
_mm_adds_epu8( | |
input, | |
_mm_set_epi64x( | |
0x7070707070707070LL, | |
0x7070707070707070LL | |
) | |
) | |
); | |
} | |
static const_inline u8x16 | |
select_nonzero_input (u8x16 state, | |
u8x16 input) | |
{ | |
return _mm_blendv_epi8( | |
input, | |
state, | |
_mm_cmpeq_epi8( | |
input, | |
_mm_set_epi64x(0LL, 0LL) | |
) | |
); | |
} | |
#endif /* __x86_64__ || _MSC_VER */ | |
static const_inline uint16_t | |
mutation_sad (u8x16 state, | |
u8x16 paths) | |
{ | |
return sum_of_abs_diff( | |
state, | |
mutate(state, paths) | |
); | |
} | |
#endif /* B24_SEQ_H_ */ |
/** | |
* @file test-cortex.c | |
* | |
* Compiling with MSVC 19.41 or later: | |
* | |
 * cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 cortex.c layer.c test-cortex.c /Fe: test-cortex.exe /MD /link ntdll.lib
*/ | |
#include <inttypes.h> | |
#include <math.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include "cortex.h" | |
int | |
main (void) | |
{ | |
char screen[69897]; | |
char *s = screen; | |
struct cortex *ctx = cortex_create(); | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = '2'; | |
*s++ = 'J'; | |
for (uint64_t k = 0U;; ++k) { | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = 'H'; | |
for (uint16_t row = 0U; row < 128U; row += 2U) { | |
uint8_t r = 255U; | |
for (uint16_t col = 0U; col < 128U; col += 2U) { | |
struct cortex_loc loc = | |
linear_loc_fwd( | |
(struct linear_loc){ | |
.row = row, | |
.col = col | |
} | |
); | |
uint8_t b = ctx->data[0].v[loc.seq << 4U | loc.rot] | |
.u8[loc.chn ]; | |
uint8_t p = b >> 4U; | |
uint8_t q = b & 15U; | |
if (p != r) { | |
r = p; | |
*s++ = '\033'; | |
*s++ = '['; | |
*s++ = '3'; | |
*s++ = '8'; | |
*s++ = ';'; | |
*s++ = '5'; | |
*s++ = ';'; | |
p = 232U + ((p & 8U) | |
? -((16U - p) * 23U - 3U) | |
: p); | |
if (p >= 10U) { | |
if (p >= 100U) { | |
*s++ = (char)(p / 100U + (uint8_t)'0'); | |
p %= 100U; | |
} | |
*s++ = (char)(p / 10U + (uint8_t)'0'); | |
p %= 10U; | |
} | |
*s++ = (char)(p + (uint8_t)'0'); | |
*s++ = 'm'; | |
} | |
*s++ = "0123456789abcdef"[p]; | |
*s++ = "0123456789abcdef"[q]; | |
} | |
*s++ = '\n'; | |
} | |
*s++ = '\0'; | |
s = screen; | |
printf("%s\033[m%128" PRIu64 "\n", s, k); | |
cortex_tick(ctx); | |
//hol_up(100000000); | |
} | |
cortex_destroy(&ctx); | |
} |
/** @file util.h | |
*/ | |
#ifndef B24_UTIL_H_ | |
#define B24_UTIL_H_ | |
#ifdef _MSC_VER | |
# define WIN32_LEAN_AND_MEAN | |
# include <intrin.h> | |
# include <windows.h> | |
# include <sysinfoapi.h> | |
#else | |
# if defined(__aarch64__) | |
# include <arm_neon.h> | |
# elif defined(__x86_64__) | |
# include <immintrin.h> | |
# endif | |
#endif | |
#include <stddef.h> | |
#include <stdint.h> | |
#ifdef __aarch64__ | |
typedef uint8x16_t u8x16; | |
#else | |
typedef __m128i u8x16; | |
#endif | |
#ifndef __has_builtin | |
# define __has_builtin(...) 0 | |
#endif | |
#ifdef _MSC_VER | |
# define __attribute__(...) | |
# define force_inline __forceinline | |
# define pragma_msvc(...) _Pragma(#__VA_ARGS__) | |
# define typeof __typeof__ | |
#else | |
# define force_inline __attribute__((always_inline)) inline | |
# define pragma_msvc(...) | |
#endif | |
#define array_size(a) (sizeof (a) / sizeof (a)[0]) | |
#define const_function __attribute__((const)) | |
#define const_inline const_function force_inline | |
#define pure_inline __attribute__((pure)) force_inline | |
/** | |
* @brief Macros for doing Morton encoding stuff. | |
* | |
* We employ Morton encoding to map linear coordinates (e.g. array offsets | |
* and bit indices) onto a quad tree space. This allows us to think of the | |
* layer topology as recursive nested quadrants, which is a more intuitive | |
* way to be confused. | |
* | |
* What happens in practice is that we take the upper and lower halves of | |
* an index and interleave them to form a single Morton encoded address. | |
* `fwd()` converts a linear address to a quad tree representation, `rev()` | |
* does the inverse, and `M()` computes the mask for a given bit width and | |
* interleaving pattern. | |
* | |
* Common parameters: | |
* | |
* @param T The integer width to use for computing and storing the Morton | |
* encoded value (usually 32 or 64). | |
* @param W The number of value bits to encode. Must be a multiple of 2. | |
* @param odd Whether to operate on the odd or even bits of the value. | |
* @param v The value to encode. | |
*/ | |
#define M(T, W, odd) (((uint##T##_t)-1) / 3U << !!(odd) >> ((8U << ((W > 32U) + 2U)) - W)) | |
#define fwd(T, W, v) (_pdep_u##T((v), M(T, W, 0)) | _pdep_u##T((v) >> W / 2U, M(T, W, 1))) | |
#define rev(T, W, v) (_pext_u##T((v), M(T, W, 0)) | _pext_u##T((v), M(T, W, 1)) << W / 2U) | |
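/* Worked example (illustrative): with T = 32 and W = 8,
 * M(32, 8, 0) == 0x55 and M(32, 8, 1) == 0xaa, so
 *
 *   fwd(32, 8, 0x2b) == 0x4d   // low nibble 0xb -> even bits,
 *                              // high nibble 0x2 -> odd bits
 *   rev(32, 8, 0x4d) == 0x2b   // and back again
 */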
#define container_of(ptr, type, member) \ | |
((type *)((char *)(1 ? (ptr) : &((type *)0)->member) - offsetof(type, member))) | |
#endif /* B24_UTIL_H_ */ |
/** @file vec.h | |
*/ | |
#ifndef B24_VEC_H_ | |
#define B24_VEC_H_ | |
#include <stdint.h> | |
#include "util.h" | |
typedef struct vec4x2 { | |
uint8_t x : 4; uint8_t y : 4; | |
} vec4x2; | |
typedef struct vec4x4 { | |
uint16_t x0 : 4; uint16_t y0 : 4; | |
uint16_t x1 : 4; uint16_t y1 : 4; | |
} vec4x4; | |
typedef struct vec4x8 { | |
uint32_t x0 : 4; uint32_t y0 : 4; | |
uint32_t x1 : 4; uint32_t y1 : 4; | |
uint32_t x2 : 4; uint32_t y2 : 4; | |
uint32_t x3 : 4; uint32_t y3 : 4; | |
} vec4x8; | |
typedef struct vec4x16 { | |
uint64_t x0 : 4; uint64_t y0 : 4; | |
uint64_t x1 : 4; uint64_t y1 : 4; | |
uint64_t x2 : 4; uint64_t y2 : 4; | |
uint64_t x3 : 4; uint64_t y3 : 4; | |
uint64_t x4 : 4; uint64_t y4 : 4; | |
uint64_t x5 : 4; uint64_t y5 : 4; | |
uint64_t x6 : 4; uint64_t y6 : 4; | |
uint64_t x7 : 4; uint64_t y7 : 4; | |
} vec4x16; | |
typedef struct vec128 { | |
union { | |
uint8_t u8[ 16]; | |
vec4x2 u4x2[16]; | |
uint16_t u16[ 8]; | |
vec4x4 u4x4[ 8]; | |
uint32_t u32[ 4]; | |
vec4x8 u4x8[ 4]; | |
uint64_t u64[ 2]; | |
vec4x16 u4x16[2]; | |
}; | |
} vec128; | |
_Static_assert(sizeof(vec4x2 ) == 1U,""); | |
_Static_assert(sizeof(vec4x4 ) == 2U,""); | |
_Static_assert(sizeof(vec4x8 ) == 4U,""); | |
_Static_assert(sizeof(vec4x16) == 8U,""); | |
_Static_assert(sizeof(vec128 ) == 16U,""); | |
struct chunk { | |
vec128 v[256]; | |
}; | |
#ifdef __aarch64__ | |
static pure_inline u8x16 | |
u8x16_from_vec128 (vec128 v) | |
{ | |
return vld1q_u8(&v.u8[0]); | |
} | |
static pure_inline vec128 | |
vec128_from_u8x16 (u8x16 v) | |
{ | |
vec128 ret; | |
vst1q_u8(&ret.u8[0], v); | |
return ret; | |
} | |
#else | |
static pure_inline u8x16 | |
u8x16_from_vec128 (vec128 v) | |
{ | |
return _mm_set_epi64x( | |
(long long)v.u64[1], | |
(long long)v.u64[0] | |
); | |
} | |
static pure_inline vec128 | |
vec128_from_u8x16 (u8x16 v) | |
{ | |
return (vec128){ | |
.u64[0] = (uint64_t)_mm_extract_epi64(v, 0), | |
.u64[1] = (uint64_t)_mm_extract_epi64(v, 1), | |
}; | |
} | |
#endif | |
#endif /* B24_VEC_H_ */ |
// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill) | |
// | |
// Copyright (c) 2020 Zach Wegner | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
#include "popcnt.h" | |
// ZP7: branchless PEXT/PDEP replacement code for non-Intel processors | |
// | |
// The PEXT/PDEP instructions are pretty cool, with various (usually arcane) | |
// uses, behaving like bitwise gather/scatter instructions. They were introduced | |
// by Intel with the BMI2 instructions on Haswell. | |
// | |
// AMD processors implement these instructions, but very slowly. PEXT/PDEP can | |
// take from 18 to ~300 cycles, depending on the input mask. See this table: | |
// https://mobile.twitter.com/InstLatX64/status/1209095219087585281 | |
// Other processors don't have PEXT/PDEP at all. This code is a polyfill for | |
// these processors. It's much slower than the raw instructions on Intel chips | |
// (which have 3-cycle latency and 1-per-cycle throughput), but should be faster than AMD's implementation. | |
// | |
// Description of the algorithm | |
// ==== | |
// | |
// This code uses a "parallel prefix popcount" technique (hereafter PPP for | |
// brevity). What this means is that we determine, for every bit in the input | |
// mask, how many bits below it are set. Or rather, aren't set--we need to get | |
// a count of how many bits each input bit should be shifted to get to its final | |
// position, that is, the difference between the bit-index of its destination | |
// and its original bit-index. This is the same as the count of unset bits in | |
// the mask below each input bit. | |
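// A small worked example (8 bits for brevity): for mask = 0b10110010 the | |
// number of unset mask bits below positions 0..7 is 0,1,1,2,3,3,3,4. For | |
// PEXT the set mask bits at positions 1,4,5,7 therefore shift right by | |
// 1,3,3,4 and land at positions 0,1,2,3, which is exactly the compacted | |
// result. | |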
// | |
// The dumb way to get this PPP would be to create a 64-element array in a loop, | |
// but we want to do this in a bit-parallel fashion. So we store the counts | |
// "vertically" across six 64-bit values: one 64-bit value holds bit 0 of each | |
// of the 64 counts, another holds bit 1, etc. We can compute these counts | |
// fairly easily using a parallel prefix XOR (XOR is equivalent to a 1-bit | |
// adder that wraps around and ignores carrying). Using parallel prefix XOR as | |
// a 1-bit adder, we can build an n-bit adder by shifting the result left by | |
// one and ANDing with the input bits: this computes the carry by seeing where | |
// an input bit causes the 1-bit sum to overflow from 1 to 0. The shift left | |
// is needed anyways, because we want the PPP values to represent population | |
// counts *below* each bit, not including the bit itself. | |
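// For instance, the parallel prefix XOR of 0b00010010 is 0b00001110: each | |
// result bit is the parity of the input bits at or below it. ANDing the | |
// left-shifted parity with the input then marks position 4, the only place | |
// where a 1-bit sum wraps from 1 to 0 and a carry must propagate. | |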
// | |
// For processors with the CLMUL instructions (most x86 CPUs since ~2010), we | |
// can do the parallel prefix XOR and left shift in one instruction, by | |
// doing a carry-less multiply by -2. | |
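// (A carry-less multiply by -2 XORs together x<<1, x<<2, ..., x<<63, so bit | |
// j of the low half of the product is the parity of the bits of x strictly | |
// below j: the shifted parallel prefix XOR in a single instruction.) | |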
// | |
// Anyways, once we have these six 64-bit values of the PPP, we can use each | |
// PPP bit to shift input bits by a power of two. That is, input bits that are | |
// in the bit-0 PPP mask are shifted by 2**0==1, bits in the bit-1 mask get | |
// shifted by 2, and so on, for shifts by 4, 8, 16, and 32 bits. Out of these | |
// six shifts, any shift value between 0 and 63 can be composed. | |
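// For example, an input bit whose PPP count is 13 == 8 + 4 + 1 is set in the | |
// bit-0, bit-2 and bit-3 PPP masks, so it is moved by the 1-, 4- and 8-bit | |
// passes and left alone by the 2-, 16- and 32-bit passes. | |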
// | |
// For PEXT, we have to perform each shift in increasing order (1, 2, ...32) so | |
// that input bits don't overlap in the intermediate results. PDEP is the | |
// opposite: the 32-bit shift needs to happen first to make room for the smaller | |
// shifts. There's also a small complication for PDEP in that the PPP values | |
// line up where the input bits *end*, rather than where the input bits start | |
// like for PEXT. This means each bit mask needs to be shifted backwards before | |
// ANDing with the input bits. | |
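// (In zp7_pdep_pre_64() below, that backwards shift is the | |
// `masks->ppp_bit[i] >> shift` step: each PPP mask is pulled back by the | |
// partial shift before selecting which input bits that pass moves.) | |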
// | |
// For both PEXT/PDEP the input bits need to be pre-masked so that only the | |
// relevant bits are being shifted around. For PEXT, this is a simple AND | |
// (input &= mask), but for PDEP we have to mask out everything but the low N | |
// bits, where N is the population count of the mask. | |
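// Example: for mask = 0xf0 (popcount 4) only the low four input bits can | |
// reach the result, so PDEP of a = 0xab first keeps just 0xb and then | |
// deposits it into the mask bits, giving 0xb0. | |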
#define N_BITS (6) | |
typedef struct { | |
uint64_t mask; | |
uint64_t ppp_bit[N_BITS]; | |
} zp7_masks_64_t; | |
#if !(defined(__aarch64__) && defined(__ARM_FEATURE_AES)) && \ | |
!((defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__)) | |
// If we don't have access to the CLMUL instruction, emulate it with | |
// shifts and XORs | |
# define prefix_sum(x) ({ \ | |
typeof(_Generic(x, uint32_t:(uint32_t)0, uint64_t:(uint64_t)0)) y = x; \ | |
y ^= y << 1U; y ^= y << 2U; y ^= y << 4U; y ^= y << 8U; y ^= y << 16U; \ | |
_Generic(y, uint32_t:y, uint64_t:y ^ y << 32U); }) | |
#endif | |
// Parallel-prefix-popcount. This is used by both the PEXT and PDEP polyfills. | |
// It can also be called separately and cached if the mask value will be used | |
// more than once; the cached masks can be shared between PEXT and PDEP calls | |
// that use the same mask (see the usage sketch at the end of this file). | |
const_function | |
static zp7_masks_64_t zp7_ppp_64(uint64_t mask) { | |
zp7_masks_64_t r; | |
r.mask = mask; | |
// Count *unset* bits | |
mask = ~mask; | |
#if defined(__aarch64__) && defined(__ARM_FEATURE_AES) | |
uint64x2_t m = vdupq_n_u64(mask); | |
uint64x2_t neg_2 = vdupq_n_u64(-2LL); | |
for (int i = 0; i < N_BITS - 1; i++) { | |
uint64x2_t bit = vreinterpretq_u64_p128(vmull_p64( | |
vgetq_lane_u64(m, 0), vgetq_lane_u64(neg_2, 0))); | |
r.ppp_bit[i] = vgetq_lane_u64(bit, 0); | |
m = vandq_u64(m, bit); | |
} | |
r.ppp_bit[N_BITS - 1] = -vgetq_lane_u64(m, 0) << 1; | |
#elif (defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__) | |
// Move the mask and -2 to XMM registers for CLMUL | |
__m128i m = _mm_cvtsi64_si128(mask); | |
__m128i neg_2 = _mm_cvtsi64_si128(-2LL); | |
for (int i = 0; i < N_BITS - 1; i++) { | |
// Do a 1-bit parallel prefix popcount, shifted left by 1, | |
// in one carry-less multiply by -2. | |
__m128i bit = _mm_clmulepi64_si128(m, neg_2, 0); | |
r.ppp_bit[i] = _mm_cvtsi128_si64(bit); | |
// Get the carry bit of the 1-bit parallel prefix popcount. On | |
// the next iteration, we will sum this bit to get the next mask | |
m = _mm_and_si128(m, bit); | |
} | |
// For the last iteration, we can use a regular multiply by -2 instead of a | |
// carry-less one (or rather, a strength reduction of that, with | |
// neg/add/etc), since there can't be any carries anyways. That is because | |
// the last value of m (which has one bit set for every 32nd unset mask bit) | |
// has at most two bits set in it, when mask is zero and thus there are 64 | |
// bits set in ~mask. If two bits are set, one of them is the top bit, which | |
// gets shifted out, since we're counting bits below each mask bit. | |
r.ppp_bit[N_BITS - 1] = -(uint64_t)_mm_cvtsi128_si64(m) << 1; | |
#else | |
for (int i = 0; i < N_BITS - 1; i++) { | |
// Do a 1-bit parallel prefix popcount, shifted left by 1 | |
uint64_t bit = prefix_sum(mask << 1); | |
r.ppp_bit[i] = bit; | |
// Get the carry bit of the 1-bit parallel prefix popcount. On | |
// the next iteration, we will sum this bit to get the next mask | |
mask &= bit; | |
} | |
// The last iteration won't carry, so just use neg/shift. See the CLMUL | |
// case above for justification. | |
r.ppp_bit[N_BITS - 1] = -mask << 1; | |
#endif | |
return r; | |
} | |
// PEXT | |
static force_inline uint64_t | |
zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) { | |
// Mask only the bits that are set in the input mask. Otherwise they collide | |
// with input bits and screw everything up | |
a &= masks->mask; | |
// For each bit in the PPP, shift right only those bits that are set in | |
// that bit's mask | |
for (int i = 0; i < N_BITS; i++) { | |
uint64_t shift = 1 << i; | |
uint64_t bit = masks->ppp_bit[i]; | |
// Shift only the input bits that are set in that bit's mask | |
a = (a & ~bit) | ((a & bit) >> shift); | |
} | |
return a; | |
} | |
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) asm("_pext_u64"); | |
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) { | |
zp7_masks_64_t masks = zp7_ppp_64(mask); | |
return zp7_pext_pre_64(a, &masks); | |
} | |
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) asm("_pext_u32"); | |
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) { | |
return (uint32_t)zp7_pext_64(a, mask); | |
} | |
// PDEP | |
static force_inline uint64_t | |
zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks) { | |
uint64_t nbits = popcnt(masks->mask); | |
// Mask just the bits that will end up in the final result--the low P bits, | |
// where P is the popcount of the mask. The other bits would collide. | |
// We need special handling for the mask==-1 case: because 64-bit shifts are | |
// implicitly modulo 64 on x86 (and (uint64_t)1 << 64 is technically | |
// undefined behavior in C), the regular "a &= (1 << pop) - 1" doesn't | |
// work: (1 << popcnt(-1)) - 1 == (1 << 64) - 1 == (1 << 0) - 1 == 0, but | |
// this should be -1. The BZHI instruction (introduced with BMI2, the same | |
// instructions as PEXT/PDEP) handles this properly, but isn't portable. | |
#if (defined(__i386__) || defined(__x86_64__)) && defined(__BMI2__) | |
a = _bzhi_u64(a, nbits); | |
#else | |
// If we don't have BZHI, use a portable workaround: nbits >> 6 is 1 only | |
// when mask == -1 (nbits == 64), so it zeroes pop_mask in exactly that | |
// case, and masking the shift count with 63 keeps the shift well-defined. | |
uint64_t pop_mask = (1ULL << (nbits & 63U)) & ~(nbits >> 6); | |
a &= pop_mask - 1; | |
#endif | |
// For each bit in the PPP, shift left only those bits that are set in | |
// that bit's mask. We do this in the opposite order compared to PEXT | |
for (int i = N_BITS - 1; i >= 0; i--) { | |
uint64_t shift = 1 << i; | |
uint64_t bit = masks->ppp_bit[i] >> shift; | |
// Micro-optimization: the bits that get shifted and those that don't | |
// will always be disjoint, so we can add them instead of ORing them. | |
// The shifts of 1 and 2 can thus merge with the adds to become LEAs. | |
a = (a & ~bit) + ((a & bit) << shift); | |
} | |
return a; | |
} | |
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) asm("_pdep_u64"); | |
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) { | |
zp7_masks_64_t masks = zp7_ppp_64(mask); | |
return zp7_pdep_pre_64(a, &masks); | |
} | |
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) asm("_pdep_u32"); | |
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) { | |
return (uint32_t)zp7_pdep_64(a, mask); | |
} |
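// A minimal caching sketch (illustrative only; `process_with_shared_mask`, | |
// `vals`, `out` and `n` are hypothetical names). Because zp7_ppp_64() and the | |
// *_pre_64() helpers are static, this pattern applies within this file: | |
#if 0 | |
static void process_with_shared_mask(const uint64_t *vals, uint64_t *out, | |
                                     size_t n, uint64_t mask) { | |
    // Compute the parallel-prefix-popcount masks once and reuse them for | |
    // every extraction that uses the same mask. | |
    zp7_masks_64_t cached = zp7_ppp_64(mask); | |
    for (size_t i = 0; i < n; i++) | |
        out[i] = zp7_pext_pre_64(vals[i], &cached); | |
} | |
#endif | |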
#ifndef ZP7_H_ | |
#define ZP7_H_ | |
#ifndef __cplusplus | |
#include <stdint.h> | |
#define STD | |
#else // __cplusplus | |
#include <cstdint> | |
#define STD ::std:: | |
extern "C" { | |
#endif // __cplusplus | |
extern STD uint32_t _pext_u32(STD uint32_t a, STD uint32_t mask); | |
extern STD uint64_t _pext_u64(STD uint64_t a, STD uint64_t mask); | |
extern STD uint32_t _pdep_u32(STD uint32_t a, STD uint32_t mask); | |
extern STD uint64_t _pdep_u64(STD uint64_t a, STD uint64_t mask); | |
#undef STD | |
#ifdef __cplusplus | |
} | |
#endif // __cplusplus | |
#endif // ZP7_H_ |