B(2, 4)
/**
* @file b24.c
*
* Compiling with MSVC 19.41 or later:
*
* cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 b24.c /Fe: b24.exe /MD
*/
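/*
 * A plausible GCC/Clang equivalent on x86-64 (untested sketch; assumes a
 * C23-capable compiler for _BitInt plus AVX2/BMI2 for the intrinsics):
 *
 *   cc -std=c2x -O2 -mavx2 -mbmi2 b24.c -o b24
 */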
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "neuron.h"
#include "popcnt.h"
#include "vec.h"
#if defined(__INTELLISENSE__) || defined(_MSC_VER)
# define BITINT_C(n) n
# define _BitInt(n) int
#else
# define BITINT_C(n) n##WB
#endif
#ifdef _MSC_VER
# pragma intrinsic(_BitScanForward)
static const_inline int __builtin_ctzl(unsigned long x) {
unsigned long y;
return _BitScanForward(&y, x) ? (int)y : (int)sizeof(x) * CHAR_BIT;
}
#endif
pragma_msvc(warning(push))
pragma_msvc(warning(disable: 4710))
pragma_msvc(warning(disable: 4711))
static const_inline char
hexdig (unsigned _BitInt(4) n)
{
return (const char[16]){
'0','1','2','3','4','5','6','7',
'8','9','a','b','c','d','e','f',
}[n];
}
static inline char *
decstr_u5 (char *dst,
unsigned _BitInt(5) src,
bool aln)
{
if (src >= BITINT_C(10U)) {
*dst++ = (char)((unsigned char)'0' + src / BITINT_C(10U));
src %= BITINT_C(10U);
} else if (aln)
*dst++ = ' ';
*dst++ = (char)((unsigned char)'0' + src);
return dst;
}
#if 0
static force_inline char *
decstr_s5 (char *dst,
unsigned _BitInt(5) src,
bool pad)
{
const typeof(src) lt0 = src >> 4;
src = (src - lt0) ^ ((BITINT_C(31U) ^ BITINT_C(31U)) - lt0);
if (lt0) {
if (src >= BITINT_C(10U)) {
*dst++ = '-';
*dst++ = '1';
src -= BITINT_C(10U);
} else {
if (pad)
*dst++ = ' ';
*dst++ = '-';
}
} else {
if (src >= BITINT_C(10U)) {
if (pad)
*dst++ = ' ';
*dst++ = '1';
src -= BITINT_C(10U);
} else if (pad) {
*dst++ = ' ';
*dst++ = ' ';
}
}
*dst++ = (char)((unsigned char)'0' + src);
return dst;
}
#endif
static force_inline char *
decstr_u4 (char *dst,
unsigned _BitInt(4) src,
bool aln)
{
if (src >= BITINT_C(10U)) {
*dst++ = '1';
src -= BITINT_C(10U);
} else if (aln)
*dst++ = ' ';
*dst++ = (char)((unsigned char)'0' + src);
return dst;
}
static force_inline char *
decstr_s4 (char *dst,
unsigned _BitInt(4) src,
bool aln)
{
if (src >= BITINT_C(8U)) {
*dst++ = '-';
*dst++ = (char)((unsigned char)'8' +
(unsigned char)8 - src);
} else {
if (aln)
*dst++ = ' ';
*dst++ = (char)((unsigned char)'0' + src);
}
return dst;
}
static const uint16_t b24[] = {
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU,
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U,
0xcbdU, 0xd2fU, 0xd79U, 0xde5U,
0xf2dU, 0xf4bU, 0xf59U, 0xf65U,
};
pragma_msvc(warning(push))
pragma_msvc(warning(disable: 4200))
pragma_msvc(warning(disable: 4820))
struct b24_cfg {
uint32_t have_opt;
uint16_t rotation;
uint16_t offset;
uint64_t neuron;
size_t n_seq;
uint16_t seq[];
};
pragma_msvc(warning(pop))
typedef typeof((struct b24_cfg){0}.have_opt) b24_cfg_flags;
typedef typeof((struct b24_cfg){0}.offset) b24_cfg_offset;
struct trace {
uint64_t cyc;
uint32_t seq;
uint16_t len;
uint16_t map;
};
const_function
static struct trace
b24_trace_init (struct b24_cfg const *const cfg,
unsigned i)
{
uint32_t seq = cfg->seq[i] | ((uint32_t)cfg->seq[i] << 16U);
return (struct trace){
.cyc = 0U,
.seq = cfg->rotation ? (seq >> (16 - cfg->rotation))
| (seq << cfg->rotation) : seq,
.len = 0U,
.map = 0U,
};
}
const_function
static struct trace
b24_trace_from (struct trace tc,
b24_cfg_offset off)
{
tc.cyc = off;
tc.len = 1U;
tc.map = 1U << off;
for (;;) {
off = (tc.seq >> off) & (b24_cfg_offset)15;
typeof(tc.map) bit = (typeof(bit))1 << off;
if (tc.map & bit)
break;
tc.map |= bit;
tc.cyc |= (typeof(tc.cyc))off
<< (tc.len++ << 2U);
}
return tc;
}
const_function
static struct trace
b24_trace_next (struct trace tc)
{
if (tc.map == (typeof(tc.map))0xffff)
return (struct trace){0};
b24_cfg_offset off = 0U;
uint_fast16_t map = tc.map;
while (map & (typeof(map))1) {
map >>= 1U;
++off;
}
return b24_trace_from(tc, off);
}
enum b24_opt {
/* The std_in option ('-') is 0 to guarantee all
* argument-accepting options stored in `expect`
* evaluate to true during argument parsing.
*/
b24_opt_std_in , // - (unused for now)
b24_opt_big_hex , // -b
b24_opt_cycles , // -c
b24_opt_debruijn, // -d
b24_opt_entries , // -e
b24_opt_graphviz, // -g
b24_opt_help , // -h
b24_opt_json , // -j
b24_opt_neuron , // -n
b24_opt_one_path, // -o
b24_opt_python , // -p
b24_opt_expand , // -q
b24_opt_rotation, // -r
b24_opt_sig_path, // -s
b24_opt_no_space, // -w
b24_opt_hex_path, // -x
b24_opt_no_align, // -y
b24_opt_no_zeros, // -z
// qualifier flags
ONLY_ONCE = 0x40U,
NEEDS_ARG = 0x80U,
};
#define b24_opt_char(x) \
_Generic(&(int[1U+b24_opt_##x]){0} \
, int(*)[1U+b24_opt_std_in ]: 0 \
, int(*)[1U+b24_opt_big_hex ]: 'b' \
, int(*)[1U+b24_opt_cycles ]: 'c' \
, int(*)[1U+b24_opt_debruijn]: 'd' \
, int(*)[1U+b24_opt_entries ]: 'e' \
, int(*)[1U+b24_opt_graphviz]: 'g' \
, int(*)[1U+b24_opt_help ]: 'h' \
, int(*)[1U+b24_opt_json ]: 'j' \
, int(*)[1U+b24_opt_neuron ]: 'n' \
, int(*)[1U+b24_opt_one_path]: 'o' \
, int(*)[1U+b24_opt_python ]: 'p' \
, int(*)[1U+b24_opt_expand ]: 'q' \
, int(*)[1U+b24_opt_rotation]: 'r' \
, int(*)[1U+b24_opt_sig_path]: 's' \
, int(*)[1U+b24_opt_no_space]: 'w' \
, int(*)[1U+b24_opt_hex_path]: 'x' \
, int(*)[1U+b24_opt_no_align]: 'y' \
, int(*)[1U+b24_opt_no_zeros]: 'z' )
static const char b24_opt_to_char[] = {
#define b24_option(x) [b24_opt_##x] = b24_opt_char(x)
b24_option(std_in ),
b24_option(big_hex ),
b24_option(cycles ),
b24_option(debruijn),
b24_option(entries ),
b24_option(graphviz),
b24_option(help ),
b24_option(json ),
b24_option(neuron ),
b24_option(one_path),
b24_option(python ),
b24_option(expand ),
b24_option(rotation),
b24_option(sig_path),
b24_option(no_space),
b24_option(hex_path),
b24_option(no_align),
b24_option(no_zeros),
#undef b24_option
};
static const unsigned char
b24_char_to_opt[1U << CHAR_BIT] = {
#define b24_option(x) [b24_opt_char(x)] = b24_opt_##x
b24_option(std_in )| ONLY_ONCE,
b24_option(big_hex )| ONLY_ONCE,
b24_option(cycles )| ONLY_ONCE,
b24_option(debruijn)| NEEDS_ARG,
b24_option(entries )| ONLY_ONCE,
b24_option(graphviz)| ONLY_ONCE,
b24_option(help )| ONLY_ONCE,
b24_option(json )| ONLY_ONCE,
b24_option(neuron )| ONLY_ONCE
| NEEDS_ARG,
b24_option(one_path)| ONLY_ONCE
| NEEDS_ARG,
b24_option(python )| ONLY_ONCE,
b24_option(expand )| ONLY_ONCE,
b24_option(rotation)| ONLY_ONCE
| NEEDS_ARG,
b24_option(sig_path)| ONLY_ONCE,
b24_option(no_space)| ONLY_ONCE,
b24_option(hex_path)| ONLY_ONCE,
b24_option(no_align)| ONLY_ONCE,
b24_option(no_zeros)| ONLY_ONCE,
#undef b24_option
};
/**
* @brief Check if option `opt` is present.
*/
static pure_inline bool
b24_has_option (struct b24_cfg const *const cfg,
const enum b24_opt opt)
{
return cfg->have_opt & ((b24_cfg_flags)1 << opt);
}
/**
* @brief Mark option `opt` as present.
*/
static force_inline void
b24_option_add (struct b24_cfg *const cfg,
const enum b24_opt opt)
{
cfg->have_opt |= (b24_cfg_flags)1 << opt;
}
struct b24_syntax {
char bra;
char ket;
char sep;
char spc;
};
static force_inline char *
hexstr_u4 (char *dst,
unsigned _BitInt(4) src)
{
*dst++ = '0';
*dst++ = 'x';
*dst++ = hexdig(src);
return dst;
}
static inline char *
hexstr_u16 (char *dst,
uint16_t src,
bool pad)
{
*dst++ = '0';
*dst++ = 'x';
if (pad) {
*dst++ = hexdig(src >> 12U );
*dst++ = hexdig(src >> 8U & 15U);
*dst++ = hexdig(src >> 4U & 15U);
} else if (src > UINT16_C(0x000f)) {
if (src > UINT16_C(0x00ff)) {
if (src > UINT16_C(0x0fff))
*dst++ = hexdig(src >> 12U);
*dst++ = hexdig(src >> 8U & 15U);
}
*dst++ = hexdig(src >> 4U & 15U);
}
*dst++ = hexdig(src & 15U);
return dst;
}
static force_inline char *
hexstr_16x4 (char *dst,
uint64_t src,
unsigned len,
struct b24_syntax syn)
{
if (len) {
for (;; src >>= 4) {
dst = hexstr_u4(dst, src & 15U);
if (!--len)
break;
*dst++ = syn.sep;
*dst = syn.spc;
dst+= !!syn.spc;
}
}
return dst;
}
static char *
hexstr_u64 (char *dst,
uint64_t src,
bool pad)
{
char buf[16] = "000000000000000";
char *p = &buf[15];
for (;; --p) {
*p = hexdig(src & 15U);
src >>= 4U;
if (!src)
break;
}
if (pad)
p = &buf[0];
*dst++ = '0';
*dst++ = 'x';
for (;; ++p) {
*dst++ = *p;
if (p == &buf[15])
break;
}
return dst;
}
static force_inline char *
decstr_16x4 (char *dst,
uint64_t src,
unsigned len,
struct b24_syntax syn,
bool aln,
bool sig)
{
if (len) {
bool w = syn.spc;
if (sig) {
for (;; src >>= 4) {
dst = decstr_s4(dst, src & 15U, aln);
if (!--len)
break;
*dst++ = syn.sep;
*dst = syn.spc;
dst += w;
}
} else {
for (;; src >>= 4) {
dst = decstr_u4(dst, src & 15U, aln);
if (!--len)
break;
*dst++ = syn.sep;
*dst = syn.spc;
dst += w;
}
}
}
return dst;
}
static char *
b24_cfg_trace_print (struct b24_cfg const *cfg,
char *dst,
struct trace src,
unsigned idx,
struct b24_syntax syn)
{
bool align = !b24_has_option(cfg, b24_opt_no_align);
bool zeros = !b24_has_option(cfg, b24_opt_no_zeros);
bool expand = b24_has_option(cfg, b24_opt_expand);
bool all = !b24_has_option(cfg, b24_opt_one_path) &&
!expand;
if (all) {
*dst++ = syn.bra;
dst = hexstr_u16(dst, (uint16_t)src.seq, zeros);
*dst++ = syn.sep;
*dst = syn.spc;
dst+= !!syn.spc;
dst = decstr_u5(dst, idx, align);
*dst++ = syn.sep;
*dst = syn.spc;
dst+= !!syn.spc;
dst = hexstr_u16(dst, src.map, zeros);
*dst++ = syn.sep;
*dst = syn.spc;
dst+= !!syn.spc;
dst = decstr_u5(dst, src.len, align);
*dst++ = syn.sep;
*dst = syn.spc;
dst+= !!syn.spc;
}
if (b24_has_option(cfg, b24_opt_big_hex)) {
dst = hexstr_u64(dst, src.cyc, zeros);
} else {
*dst++ = syn.bra;
dst = b24_has_option(cfg, b24_opt_hex_path)
? hexstr_16x4(dst, src.cyc, src.len, syn)
: decstr_16x4(dst, src.cyc, src.len, syn, align,
b24_has_option(cfg, b24_opt_sig_path));
*dst++ = syn.ket;
}
if (all)
*dst++ = syn.ket;
*dst = '\0';
return dst;
}
static struct trace
b24_expand (struct trace tc)
{
tc.cyc = seq_expand((uint16_t)tc.seq);
tc.len = 16U;
tc.map = 0xffffU;
return tc;
}
static void
b24_cfg_trace (struct b24_cfg const *cfg)
{
struct b24_syntax syn = (struct b24_syntax[]){
{'{', '}', ',', ' '},
{'[', ']', ',', ' '}, // json or python
{'{', '}', ',', 0x0}, // no space
{'[', ']', ',', 0x0}, // js/py, no space
}[b24_has_option(cfg, b24_opt_json) |
b24_has_option(cfg, b24_opt_python) |
(b24_has_option(cfg, b24_opt_no_space) << 1U)];
bool expand = b24_has_option(cfg, b24_opt_expand);
bool once = b24_has_option(cfg, b24_opt_one_path);
bool all = !once && !expand;
bool please = false;
for (unsigned i = 0; i < cfg->n_seq;) {
struct trace tc = b24_trace_init(cfg, i);
unsigned cycles = 0;
for (;; ++cycles) {
struct trace t;
if (once) {
t = b24_trace_from(tc, cfg->offset);
} else if (expand) {
t = b24_expand(tc);
} else {
t = b24_trace_next(tc);
if (!t.map)
break;
tc.cyc |= t.cyc << (tc.len << 2U);
tc.len += t.len;
tc.map |= t.map;
}
char buf[160];
char *p = b24_cfg_trace_print(cfg, buf, t,
cycles, syn);
if (p) {
if (please) {
(void)putchar(syn.sep);
(void)putchar('\n');
}
(void)fputs(buf, stdout);
please = true;
}
if (!all)
break;
}
if (++i == cfg->n_seq && please)
(void)putchar('\n');
}
}
static void
b24_help (void)
{
(void)printf(
"Usage: b24 [OPTIONS]... [<0..65535>]...\n"
"Options:\n"
" -%c Print this help text and exit\n"
" -%c <state> Trace a neuron's state evolution\n"
" -%c <0..15> Trace a single path at offset\n"
" -%c Expand subsequences in place\n"
"\n"
"Input options:\n"
" -%c <0..15> Add distinct B(2, 4) as input\n"
" -%c <0..15> Apply rotation before tracing\n"
"\n"
"Output options:\n"
" -%c Only cycles, no entry paths\n"
" -%c Always find all entry paths\n"
"\n"
"Formatting options:\n"
" -%c Output in Graphviz format\n"
" -%c Output in JSON format\n"
" -%c Output in Python format\n"
" -%c Signed decimal path steps\n"
" -%c Hexadecimal path steps\n"
" -%c Print every sequence as one big hex number\n"
" -%c Don't align output columns\n"
" -%c Don't zero-pad hexadecimals\n"
" -%c Omit non-separator whitespace\n",
b24_opt_char(help), b24_opt_char(neuron),
b24_opt_char(one_path), b24_opt_char(expand),
b24_opt_char(debruijn), b24_opt_char(rotation),
b24_opt_char(cycles), b24_opt_char(entries),
b24_opt_char(graphviz), b24_opt_char(json),
b24_opt_char(python), b24_opt_char(sig_path),
b24_opt_char(hex_path), b24_opt_char(big_hex),
b24_opt_char(no_align), b24_opt_char(no_zeros),
b24_opt_char(no_space)
);
}
struct str7 {
union {
char d[8U];
uint64_t u;
};
};
const_inline
static struct str7
b24_opt_char_str (char ch)
{
struct str7 str = {{{'"', '-', ch}}};
str.d[3U - !ch] = '"';
return str;
}
static void
b24_cfg_destroy (struct b24_cfg **cfg)
{
if (cfg && *cfg) {
free(*cfg);
*cfg = NULL;
}
}
_Noreturn static void
b24_helpful_exit (struct b24_cfg **cfg,
int ret)
{
b24_cfg_destroy(cfg);
b24_help();
exit(ret);
}
static struct b24_cfg *
b24_cfg_create (size_t n_seq)
{
struct b24_cfg *cfg = calloc(1, offsetof(struct b24_cfg, seq)
+ n_seq * sizeof cfg->seq[0]);
if (!cfg)
perror("calloc");
return cfg;
}
_Noreturn static void
b24_cosmic_ray (struct b24_cfg **cfg,
char const *what,
char const *func,
int line)
{
size_t n = what ? strlen(what) : 0;
if (!n)
what = "";
(void)fprintf(stderr, "[\033[31mERROR\033[m] %s:%d: %s%s\n",
func, line, what, &".\nFailure caused by the impact "
"of a high-energy cosmic particle or a high-density "
"program author."[n ? what[n-1U] == '.' : 2U]);
b24_cfg_destroy(cfg);
exit(EXIT_FAILURE);
}
static int
b24_options_incompatible (struct b24_cfg *cfg,
enum b24_opt opt,
b24_cfg_flags bad,
const char sep)
{
if (!b24_has_option(cfg, opt))
return 0;
bad &= cfg->have_opt;
if (!bad)
return 0;
cfg->have_opt &= ~((typeof(cfg->have_opt))1 << opt);
int n = popcnt(bad);
if (n >= (int)array_size(b24_opt_to_char)) {
b24_cosmic_ray(&cfg, "Option flag outside expected range",
__func__, __LINE__);
}
char buf[64] = "";
char *dst = &buf[0];
b24_cfg_flags bit = 1U;
for (int prev = 0, i = 0; i < n;) {
int o = __builtin_ctzl(bad);
if (o >= (int)array_size(b24_opt_to_char))
break;
bit <<= o - prev;
bad &= ~bit;
prev = o;
if (++i > 1) {
if (n > 2)
*dst++ = sep;
*dst++ = ' ';
if (i == n) {
*dst++ = 'o';
*dst++ = 'r';
*dst++ = ' ';
}
}
struct str7 s = b24_opt_char_str(b24_opt_to_char[o]);
for (char *src = s.d; *src; ++src) { *dst++ = *src; }
}
*dst = '\0';
(void)fprintf(stderr, "Option %s cannot be combined with %s\n",
b24_opt_char_str(b24_opt_to_char[opt]).d, buf);
return 1;
}
static void
b24_option_assert_once (struct b24_cfg *cfg,
enum b24_opt opt)
{
if (b24_has_option(cfg, opt)) {
(void)fprintf(stderr, "Option %s specified twice\n",
b24_opt_char_str(b24_opt_to_char[opt]).d);
b24_helpful_exit(&cfg, EXIT_FAILURE);
}
}
static inline uint64_t
b24_get_u64_arg (char *arg,
int *err)
{
if (!arg) {
*err = EFAULT;
return 0;
}
if (!*arg) {
*err = EINVAL;
return 0;
}
errno = 0;
uint64_t u64 = 0U;
char *endptr = arg;
int64_t i64 = _Generic(i64, long: strtol,
long long: strtoll)(arg, &endptr, 0);
int e = errno;
if (!e) {
u64 = (uint64_t)i64;
if (*endptr)
e = EINVAL;
} else if (e == ERANGE && i64 == _Generic(i64, long: LONG_MAX,
long long: LLONG_MAX)) {
errno = 0;
endptr = arg;
u64 = _Generic(u64, unsigned long: strtoul,
unsigned long long: strtoull)(arg, &endptr, 0);
e = errno;
if (!e && *endptr)
e = EINVAL;
}
*err = e;
return u64;
}
static uint64_t
b24_get_int_arg (char *arg,
int *err,
uint64_t max)
{
int e = 0;
uint64_t u64 = b24_get_u64_arg(arg, &e);
if (u64 > max) {
u64 = max;
if (!e)
e = ERANGE;
}
*err = e;
return u64;
}
static struct b24_cfg *
b24_parse_args (int argc,
char **argv)
{
struct b24_cfg *cfg = b24_cfg_create(argc > 1 ? argc - 1 : 0);
if (!cfg)
return NULL;
enum b24_opt expect = 0;
char c_ = '\0';
for (int i = 1; i < argc; i++) {
char *a = argv[i];
if (expect) {
if (!*a)
goto missing_arg;
parse_arg: do{}while(0);
pragma_msvc(warning(push))
pragma_msvc(warning(disable: 4061))
uint64_t max = 15U;
switch (expect) {
case b24_opt_neuron:
max = UINT64_MAX;
break;
case b24_opt_debruijn:
case b24_opt_one_path:
case b24_opt_rotation:
break;
default:
b24_cosmic_ray(&cfg, "", __func__, __LINE__);
}
int e = 0;
uint64_t n = b24_get_int_arg(a, &e, max);
if (e) {
(void)fprintf(stderr, "Option %s "
"parse error: %s\n",
b24_opt_char_str(c_).d,
strerror(e));
b24_helpful_exit(&cfg, EXIT_FAILURE);
}
switch (expect) {
case b24_opt_debruijn:
cfg->seq[cfg->n_seq++] = b24[n];
break;
case b24_opt_neuron:
cfg->neuron = n;
break;
case b24_opt_one_path:
cfg->offset = (b24_cfg_offset)n;
break;
case b24_opt_rotation:
cfg->rotation = (uint16_t)n;
default:
break;
}
pragma_msvc(warning(pop))
b24_option_add(cfg, expect);
expect = 0;
continue;
}
if (*a == '-') {
c_ = *++a;
concatenated_option:
unsigned ch_opt = b24_char_to_opt[(unsigned char)c_];
if (!ch_opt) {
(void)fprintf(stderr, "Option %s is unknown"
"\n", b24_opt_char_str(c_).d);
b24_helpful_exit(&cfg, EXIT_FAILURE);
}
enum b24_opt opt = ch_opt & ~(NEEDS_ARG | ONLY_ONCE);
if (ch_opt & ONLY_ONCE)
b24_option_assert_once(cfg, opt);
if (ch_opt & NEEDS_ARG) {
expect = opt;
if (*++a)
goto parse_arg;
} else {
b24_option_add(cfg, opt);
if (c_ && (c_ = *++a))
goto concatenated_option;
}
continue;
}
int e = 0;
uint64_t n = b24_get_int_arg(a, &e, UINT16_MAX);
if (e) {
(void)fprintf(stderr, "Bad sequence: %s\n",
strerror(e));
b24_cfg_destroy(&cfg);
return NULL;
}
cfg->seq[cfg->n_seq++] = (uint16_t)n;
}
int yikes = b24_options_incompatible(cfg, b24_opt_big_hex ,
1U << b24_opt_hex_path |
1U << b24_opt_sig_path , ',')
+ b24_options_incompatible(cfg, b24_opt_cycles ,
1U << b24_opt_entries |
1U << b24_opt_neuron |
1U << b24_opt_expand , ',')
+ b24_options_incompatible(cfg, b24_opt_entries ,
1U << b24_opt_cycles |
1U << b24_opt_neuron |
1U << b24_opt_expand , ',')
+ b24_options_incompatible(cfg, b24_opt_graphviz ,
1U << b24_opt_json |
1U << b24_opt_python , ',')
+ b24_options_incompatible(cfg, b24_opt_json ,
1U << b24_opt_python |
1U << b24_opt_graphviz , ',')
+ b24_options_incompatible(cfg, b24_opt_neuron ,
1U << b24_opt_one_path |
1U << b24_opt_expand , ',')
+ b24_options_incompatible(cfg, b24_opt_one_path ,
1U << b24_opt_expand , ',')
+ b24_options_incompatible(cfg, b24_opt_python ,
1U << b24_opt_json |
1U << b24_opt_graphviz , ',')
+ b24_options_incompatible(cfg, b24_opt_expand ,
1U << b24_opt_cycles |
1U << b24_opt_entries |
1U << b24_opt_one_path , ',')
+ b24_options_incompatible(cfg, b24_opt_sig_path ,
1U << b24_opt_hex_path , ',');
if (yikes)
b24_helpful_exit(&cfg, EXIT_FAILURE);
if (expect)
goto missing_arg;
if (b24_has_option(cfg, b24_opt_help))
b24_helpful_exit(&cfg, EXIT_SUCCESS);
if (!cfg->n_seq) {
(void)fputs("No sequence(s) specified\n", stderr);
b24_helpful_exit(&cfg, EXIT_FAILURE);
}
return cfg;
missing_arg:
(void)fprintf(stderr, "Option %s expects an argument\n",
b24_opt_char_str(c_).d);
b24_helpful_exit(&cfg, EXIT_FAILURE);
}
static char *
b24_render_gv_struct (char *dst,
size_t siz,
uint16_t seq,
unsigned _BitInt(4) id)
{
if (siz <
sizeof "S0 [label=\"<f>0|<e>0|<d>0|<c>0|"
"<b>0|<a>0|<9>0|<8>0|"
"<7>0|<6>0|<5>0|<4>0|"
"<3>0|<2>0|<1>0|<0>0\"];\n")
return NULL;
memcpy(dst, "S0 [label=\"", sizeof "S0 [label=\"" - 1U);
*++dst = hexdig(id);
dst += sizeof " [label=\"";
for (unsigned off = 15U;; --off) {
*dst++ = '<';
*dst++ = hexdig(off);
*dst++ = '>';
*dst++ = (char)((unsigned char)'0' + (seq >> off & 1U));
if (!off)
break;
*dst++ = '|';
}
*dst++ = '"';
*dst++ = ']';
*dst++ = ';';
*dst++ = '\n';
return dst;
}
int
main (int c,
char **v)
{
struct b24_cfg *cfg = b24_parse_args(c, v);
if (!cfg)
return EXIT_FAILURE;
b24_cfg_trace(cfg);
b24_cfg_destroy(&cfg);
return EXIT_SUCCESS;
}
pragma_msvc(warning(pop))
/** @file clock.h
*/
#ifndef B24_CLOCK_H_
#define B24_CLOCK_H_
#ifndef _MSC_VER
# include <time.h>
#endif
#include "vec.h"
static force_inline vec128
unix_epoch (void)
{
#ifdef _MSC_VER
FILETIME ft = {0};
GetSystemTimeAsFileTime(&ft);
/* 100 ns FILETIME ticks since 1601 -> Unix epoch seconds; wrap in a
 * vec128 to match the return type (mirrors the POSIX layout below). */
uint64_t sec = ((ULARGE_INTEGER){
.LowPart = ft.dwLowDateTime,
.HighPart = ft.dwHighDateTime,
}).QuadPart / UINT64_C(10000000) -
((369U * 365U + 89U) * UINT64_C(86400));
return (vec128){
.u32[0] = (uint32_t)sec, .u32[1] = (uint32_t)sec,
.u32[2] = (uint32_t)sec, .u32[3] = (uint32_t)sec
};
#else
struct timespec ts = {0}, ts2 = {0};
(void)clock_gettime(CLOCK_REALTIME, &ts);
(void)clock_gettime(CLOCK_MONOTONIC, &ts2);
vec128 epoch = {
.u32[0] = ts.tv_sec,
.u32[2] = ts2.tv_sec,
.u32[1] = ts2.tv_sec,
.u32[3] = ts.tv_sec
};
return epoch;
#endif
}
static force_inline void hol_up (uint64_t ns)
{
#ifdef _MSC_VER
LARGE_INTEGER t = {
.QuadPart = (int64_t)(ns / 100U) * -1
};
(void)NtDelayExecution(FALSE, &t);
#else
struct timespec t = {
.tv_sec = (typeof(t.tv_sec))(ns / UINT64_C(1000000000)),
.tv_nsec = (typeof(t.tv_nsec))(ns % UINT64_C(1000000000))
};
(void)clock_nanosleep(CLOCK_MONOTONIC, 0, &t, NULL);
#endif
}
#endif /* B24_CLOCK_H_ */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "cortex.h"
#include "seq.h"
struct cortex *
cortex_create (void)
{
struct cortex *ctx = calloc(1, sizeof *ctx);
if (!ctx) {
perror("calloc");
return NULL;
}
cortex_init(ctx);
return ctx;
}
void
cortex_destroy (struct cortex **ctx)
{
if (ctx) {
struct cortex *c = *ctx;
*ctx = NULL;
ctx = NULL;
free(c);
c = NULL;
}
}
void
cortex_init (struct cortex *ctx)
{
const uint16_t de_bruijn_seq_2_4[] = {
0x9afU, 0x9ebU, 0xa6fU, 0xa7bU,
0xb3dU, 0xb4fU, 0xbcdU, 0xbd3U,
0xcbdU, 0xd2fU, 0xd79U, 0xde5U,
0xf2dU, 0xf4bU, 0xf59U, 0xf65U,
};
for (uint32_t i = 0, y = 0; y < 16U; ++y) {
uint32_t seq = de_bruijn_seq_2_4[y];
seq |= seq << 16U;
for (uint32_t x = i + 16U; i < x; ++i) {
ctx->paths[fwd(32, 8, i)] =
seq_expand_vec128((uint16_t)seq);
seq >>= 1U;
}
}
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) {
layer_init(&ctx->layer[i], (struct layer_cfg){
.self = i,
.prev = (i + array_size(ctx->layer)
- 1U) % array_size(ctx->layer),
.next = (i + 1U) % array_size(ctx->layer),
.mode = i,
}, &ctx->data[i], &ctx->data[i + 1U]);
}
}
/** @file cortex.h
*/
#ifndef B24_CORTEX_H_
#define B24_CORTEX_H_
#include "clock.h"
#include "layer.h"
#include "neuron.h"
#ifdef __aarch64__
# include "zp7.h"
#endif
enum ctx_flag {
CORTEX_TOCK = 1U
};
enum { DIR_IN, DIR_OUT };
/**
* @brief Cortex structure.
*
* This is the structure that holds all the neurons.
*
* More like a resonant bucket than a brain.
*/
struct cortex {
uint64_t resvd[7]; // align to cache line
uint64_t flags;
vec128 paths[256];
struct layer layer[16];
struct chunk data[17];
};
union ctx_addr {
struct {
uint8_t b8 : 3;
uint8_t u8 : 4;
uint8_t vec : 1;
uint8_t : 8;
};
struct {
uint8_t b64 : 6;
uint8_t u64 : 1;
uint8_t : 1;
uint8_t rot : 4;
uint8_t seq : 4;
};
struct {
uint8_t : 3;
uint8_t chn : 4;
uint8_t dir : 1;
uint8_t nrn : 8;
};
uint16_t raw;
};
_Static_assert(sizeof(union ctx_addr) == 2U, "");
static pure_inline union ctx_addr
ctx_addr (uint16_t raw)
{
return (union ctx_addr){.raw = raw};
}
extern struct cortex *
cortex_create (void);
extern void
cortex_destroy (struct cortex **ctx);
extern void
cortex_init (struct cortex *ctx);
struct cortex_loc {
uint16_t bit : 2;
uint16_t chn : 4;
uint16_t rot : 4;
uint16_t seq : 4;
uint16_t pad : 2;
};
struct linear_loc {
uint16_t col : 7;
uint16_t row : 7;
uint16_t pad : 2;
};
static const_inline struct cortex_loc
linear_loc_fwd (struct linear_loc loc)
{
union {
struct cortex_loc ctx;
struct linear_loc lin;
uint16_t raw;
} src = {
.lin = loc
}, dst = {
.raw = (uint16_t)fwd(32, 14, src.raw)
};
return dst.ctx;
}
static force_inline void
cortex_tick (struct cortex *ctx)
{
for (uint32_t i = 0; i < array_size(ctx->layer); ++i) {
layer_tick(&ctx->layer[i], &ctx->data[(i + 1U) & 15U]);
layer_tock(&ctx->layer[i], &ctx->data[i ]);
}
}
#endif /* B24_CORTEX_H_ */
#include <stdio.h>
#include "layer_priv.h"
void
layer_init (struct layer *obj,
struct layer_cfg cfg,
struct chunk *in,
struct chunk *out)
{
*obj = layer(cfg, in, out);
}
void
layer_fini (struct layer *obj)
{
(void)obj;
}
void
layer_run (struct layer *obj,
struct chunk *input)
{
bool init_now = !obj->cfg.init;
unsigned hole = 0;
for (uint16_t i = 0; i < (uint16_t)array_size(obj->data[0]->v); ++i) {
// internal connections
vec128 self_vec = *layer_vector(obj, (union ctx_addr){
.dir = DIR_IN,
.nrn = i,
});
vec128 peer_vec[16] = {0};
union ctx_addr peer_addr[16] = {0};
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) {
uint16_t tmp = layer_map(obj, (union ctx_addr){
.chn = j,
.nrn = i,
});
peer_addr[j] = (union ctx_addr){
.chn = tmp & 15U,
.dir = DIR_IN,
.nrn = tmp >> 4U,
};
peer_vec[j] = *layer_vector(obj, peer_addr[j]);
}
for (uint16_t j = 0; j < (uint16_t)array_size(self_vec.u8); ++j) {
if (peer_addr[j].nrn == i) {
if (init_now)
++hole;
// self connection (external input)
// TOEDOE: run callback and assign result to self.u8[j]
//self_vec.u8[j] = input->v[i].u8[j];
continue;
}
uint8_t ch = self_vec.u8[j];
uint8_t up = layer_channel_whence(
obj,
(union ctx_addr) {
.chn = j,
.nrn = i,
}
);
bool match_lsb = (ch & 15U) == (up & 15U);
bool match_msb = (ch >> 4U) == j;
if ((match_msb ^ match_lsb)) {
peer_vec[j].u8[j] = self_vec.u8[j];
self_vec.u8[j] = 0;
} else if (match_msb) {
self_vec.u8[j] = 0;
} else {
self_vec.u8[j] = peer_vec[j].u8[j];
//peer_vec[j].u8[j] = 0;
}
layer_save_vector(obj, (union ctx_addr){
.dir = DIR_OUT,
.nrn = peer_addr[j].nrn,
}, peer_vec[j]);
}
struct neuron nrn = layer_make_neuron(
obj,
(union ctx_addr){
.dir = DIR_IN,
.nrn = i,
}
);
layer_save_vector(
obj,
(union ctx_addr) {
.dir = DIR_OUT,
.nrn = i,
},
neuron_send(&nrn, self_vec)
);
}
if (init_now) {
obj->cfg.init = 1;
obj->cfg.hole = hole;
(void)printf("inputs: %u\n", hole);
}
}
/** @file layer.h
*/
#ifndef B24_LAYER_H_
#define B24_LAYER_H_
#include "vec.h"
struct layer_cfg {
uint64_t self : 4;
uint64_t prev : 4;
uint64_t next : 4;
uint64_t mode : 4;
uint64_t hole : 8;
uint64_t init : 1;
uint64_t tock : 1;
uint64_t resv : 36;
};
_Static_assert(sizeof(struct layer_cfg)
== sizeof(uint64_t), "");
/**
* @brief Network layer.
*/
struct layer {
struct layer_cfg cfg;
struct chunk *data[2];
};
_Static_assert(sizeof(struct layer)
== sizeof(struct layer_cfg)
+ sizeof(struct chunk *) * 2U, "");
extern void
layer_init (struct layer *obj,
struct layer_cfg cfg,
struct chunk *in,
struct chunk *out);
extern void
layer_fini (struct layer *obj);
extern void
layer_run (struct layer *obj,
struct chunk *in);
static force_inline void
layer_tick (struct layer *obj,
struct chunk *in)
{
obj->cfg.tock = 0;
layer_run(obj, in);
}
static force_inline void
layer_tock (struct layer *obj,
struct chunk *in)
{
obj->cfg.tock = 1;
layer_run(obj, in);
}
#endif /* B24_LAYER_H_ */
/** @file layer_priv.h
*/
#ifndef B24_LAYER_PRIV_H_
#define B24_LAYER_PRIV_H_
#include <stdio.h>
#include "cortex.h"
#include "layer.h"
static pure_inline struct cortex *
to_cortex (struct layer const *obj)
{
return container_of(obj, struct cortex, layer[obj->cfg.self]);
}
static pure_inline struct vec128 *
layer_vector (struct layer const *obj,
union ctx_addr addr)
{
return &obj->data[addr.dir ^ obj->cfg.tock]->v[addr.nrn];
}
static pure_inline struct neuron
layer_make_neuron (struct layer const *obj,
union ctx_addr addr)
{
return neuron_from_vec128(
&to_cortex(obj)->paths[addr.nrn],
layer_vector(obj, addr)
);
}
static pure_inline uint8_t
layer_channel_whence (struct layer const *obj,
union ctx_addr addr)
{
return to_cortex(obj)->paths[addr.nrn].u8[addr.chn];
}
static pure_inline uint8_t
layer_channel_get (struct layer const *obj,
union ctx_addr addr)
{
return layer_vector(obj, addr)->u8[addr.chn];
}
static force_inline void
layer_channel_set (struct layer *obj,
union ctx_addr addr,
uint8_t data)
{
layer_vector(obj, addr)->u8[addr.chn] = data;
}
static pure_inline uint8_t
layer_channel_swap (struct layer *obj,
union ctx_addr addr,
uint8_t data)
{
uint8_t ret = layer_channel_get(obj, addr);
addr.dir ^= 1U;
layer_channel_set(obj, addr, data);
return ret;
}
static force_inline struct layer
layer (struct layer_cfg cfg,
struct chunk *in,
struct chunk *out)
{
return (struct layer) {
.cfg = cfg,
.data = {in, out}
};
}
static force_inline void
layer_save_vector (struct layer *obj,
union ctx_addr addr,
vec128 data)
{
*layer_vector(obj, addr) = data;
}
static pure_inline uint16_t
layer_map (struct layer const *obj,
union ctx_addr addr)
{
struct cortex *ctx = to_cortex(obj);
const uint16_t index[16] = {
// cortex::paths is shuffled with rev(32, 8),
// the original indices are given in the EOL comments.
83, 36, 100, 99, 110, 129, 213, 200, // 29, 66, 74, 89, 122, 129, 143, 168,
204, 160, 224, 240, 167, 249, 253, 170, // 170, 192, 200, 204, 211, 237, 239, 240,
};
uint16_t s = index[addr.chn];
uint16_t r = index[obj->cfg.mode];
uint16_t them = (uint16_t)ctx->paths[s].u8[addr.seq] << 8U
| (uint16_t)ctx->paths[r].u8[addr.rot] << 4U
| (uint16_t)ctx->paths[r].u8[addr.chn];
return (uint16_t)rev(32, 12, them);
}
#endif /* B24_LAYER_PRIV_H_ */
/**
* @file neuron.h
*/
#ifndef B24_NEURON_H_
#define B24_NEURON_H_
#include <limits.h>
#include "seq.h"
#include "vec.h"
/**
* @brief Runtime neuron structure.
*
* This is what is actually used with the SIMD instructions.
*/
struct neuron {
u8x16 paths;
u8x16 state;
};
static pure_inline struct neuron
neuron_from_vec128 (vec128 *paths,
vec128 *state)
{
return (struct neuron){
.paths = u8x16_from_vec128(*paths),
.state = u8x16_from_vec128(*state)
};
}
static pure_inline struct neuron
neuron (uint16_t seq,
vec128 state)
{
return (struct neuron){
.paths = seq_expand_u8x16(seq),
.state = u8x16_from_vec128(state)
};
}
static force_inline u8x16
neuron_tick (struct neuron *nrn,
u8x16 in)
{
u8x16 state = nrn->state;
nrn->state = age_and_mutate(
merge_input(
state,
in
),
nrn->paths
);
return state;
}
static force_inline vec128
neuron_tock (struct neuron *nrn,
vec128 in)
{
nrn->state = age_and_mutate(
merge_input(
nrn->state,
u8x16_from_vec128(in)
),
nrn->paths
);
return vec128_from_u8x16(nrn->state);
}
static force_inline vec128
neuron_send (struct neuron *nrn,
vec128 in)
{
nrn->state = mutate(
select_nonzero_input(
nrn->state,
u8x16_from_vec128(in)
),
nrn->paths
);
return vec128_from_u8x16(nrn->state);
}
#endif /* B24_NEURON_H_ */
/** @file popcnt.h
*/
#ifndef B24_POPCNT_H_
#define B24_POPCNT_H_
#include <limits.h>
#include <stdint.h>
#include "util.h"
#undef define_popcnt
#undef popcnt16_impl
#undef popcnt32_impl
#undef popcnt64_impl
#undef popcnt_compat
#ifdef _MSC_VER
# define popcnt16_impl return __popcnt16
# pragma intrinsic(__popcnt16)
# define popcnt32_impl return __popcnt
# pragma intrinsic(__popcnt)
# ifdef _M_IX86
# undef popcnt32x2
# define popcnt64_impl return popcnt32x2
# define popcnt32x2(x) \
(__popcnt((uint32_t)(x >> 32U)) + \
__popcnt((uint32_t)(x & ~(uint32_t)0)))
# else
# define popcnt64_impl return __popcnt64
# pragma intrinsic(__popcnt64)
# endif
# define popcnt_pre_pragma _Pragma("warning(push)") \
_Pragma("warning(disable: 4116)")
# define popcnt_post_pragma _Pragma("warning(pop)")
#else
# if __has_builtin(__builtin_popcount) && (UINT_MAX == UINT32_MAX)
# define popcnt16_impl return (uint16_t)__builtin_popcount
# define popcnt32_impl return (uint32_t)__builtin_popcount
# endif
# if !defined(popcnt32_impl) && \
__has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT32_MAX)
# define popcnt16_impl return (uint16_t)__builtin_popcountl
# define popcnt32_impl return (uint32_t)__builtin_popcountl
# endif
# if __has_builtin(__builtin_popcountl) && (ULONG_MAX == UINT64_MAX)
# define popcnt64_impl return (uint64_t)__builtin_popcountl
# endif
# if !defined(popcnt64_impl) && \
__has_builtin(__builtin_popcountll) && (ULLONG_MAX == UINT64_MAX)
# define popcnt64_impl return (uint64_t)__builtin_popcountll
# endif
# if __has_builtin(__builtin_popcountg)
# if !defined(popcnt16_impl)
# define popcnt16_impl return (uint16_t)__builtin_popcountg
# endif
# if !defined(popcnt32_impl)
# define popcnt32_impl return (uint32_t)__builtin_popcountg
# endif
# if !defined(popcnt64_impl)
# define popcnt64_impl return (uint64_t)__builtin_popcountg
# endif
# endif
#endif
#define define_popcnt(b, ...) const_inline \
static uint##b##_t popcnt##b(uint##b##_t val) \
{ __VA_ARGS__(val); } _Static_assert(b>0, #b)
#define popcnt_compat(x) typeof(_Generic( \
(char(*)[2 - (sizeof(x) > 4U)])0 \
,char(*)[1]: (uint64_t)0 \
,char(*)[2]: (uint32_t)0)) y = (x); \
y -= (typeof(y))-1/3 & (y >> 1U); \
y = ((typeof(y))-1/5 & (y >> 2U)) \
+ ((typeof(y))-1/5 & y); \
return ((typeof(y))-1/ 17 & ((y >> 4U) + y)) \
* ((typeof(y))-1/255) >> (sizeof y - 1U) \
* CHAR_BIT
#ifndef popcnt16_impl
# define popcnt16_impl popcnt_compat
#endif
#ifndef popcnt32_impl
# define popcnt32_impl popcnt_compat
#endif
#ifndef popcnt64_impl
# define popcnt64_impl popcnt_compat
#endif
#ifndef popcnt_pre_pragma
# define popcnt_pre_pragma
#endif
#ifndef popcnt_post_pragma
# define popcnt_post_pragma
#endif
define_popcnt(16, popcnt16_impl);
define_popcnt(32, popcnt32_impl);
define_popcnt(64, popcnt64_impl);
#undef define_popcnt
#undef popcnt16_impl
#undef popcnt32_impl
#undef popcnt64_impl
#undef popcnt_compat
#if defined(_MSC_VER) && defined(_M_IX86)
# undef popcnt32x2
#endif
#define popcnt(x) popcnt_pre_pragma _Generic((x), \
typeof(_Generic((char)0, \
signed char: (struct{int i;}){0}, \
unsigned char: (struct{int i;}){0}, \
default: (char)0)): popcnt16, \
signed char: popcnt16, short: popcnt16, \
unsigned char: popcnt16, int: popcnt32, \
unsigned short: popcnt16, unsigned: popcnt32, \
long long: popcnt64, unsigned long: _Generic( \
&(int[sizeof 1UL]){0}, \
int(*)[sizeof(uint32_t)]: popcnt32, \
int(*)[sizeof(uint64_t)]: popcnt64), \
unsigned long long: popcnt64, long: _Generic( \
&(int[sizeof 1L]){0}, \
int(*)[sizeof(int32_t)]: popcnt32, \
int(*)[sizeof(int64_t)]: popcnt64)) \
(_Generic((x), default: (x), \
long long: (unsigned long long)(x), \
signed char: (unsigned char)(x), \
short: (unsigned short)(x), \
typeof(_Generic((char)0, \
signed char: \
(struct{int i;}){0}, \
unsigned char: \
(struct{int i;}){0}, \
default: (char)0)): \
(unsigned char)(x), \
long: (unsigned long)(x), \
int: (unsigned)(x))) popcnt_post_pragma
#endif /* B24_POPCNT_H_ */
/** @file seq.h
*/
#ifndef B24_SEQ_H_
#define B24_SEQ_H_
#include "vec.h"
#ifdef __aarch64__
static const_inline u8x16
mutate (u8x16 state,
u8x16 paths)
{
return vqtbl1q_u8(state, paths);
}
static const_inline uint16_t
sum_of_abs_diff (u8x16 a,
u8x16 b)
{
return vaddvq_u8(vabdq_u8(a, b));
}
/**
* @brief Age and mutate the state.
*
* Decrement each non-zero strength by one and apply the given mutation.
*
* @param state The current state.
* @param paths The mutation descriptor.
* @return The resulting state.
*/
static const_inline u8x16
age_and_mutate (u8x16 state,
u8x16 paths)
{
return vqtbl1q_u8(
vsubq_u8(
state,
vbslq_u8(
vtstq_u8(
state,
vdupq_n_u8(0xf0)
),
vdupq_n_u8(0x10),
state
)
),
paths
);
}
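/* Scalar sketch of the NEON sequence above (illustration only): per lane,
 * age the strength nibble first, then permute through the path table:
 *
 *   tmp[i] = (state[i] & 0xf0) ? (uint8_t)(state[i] - 0x10) : 0;
 *   out[i] = paths[i] < 16 ? tmp[paths[i]] : 0;  // vqtbl1q_u8 lookup
 */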
/**
* @brief Merge input into the current state, preferring the input.
*
* For each channel, if the input channel has a non-zero strength,
* use that channel in the output, otherwise use the corresponding
* channel from the current state.
*
* @param state The current state.
* @param input The input to merge into the state.
* @return The merged state.
*/
static const_inline u8x16
merge_input (u8x16 state,
u8x16 input)
{
return vbslq_u8(
vtstq_u8(
input,
vdupq_n_u8(0xf0)
),
input,
state
);
}
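/* Scalar sketch (illustration only): per lane, prefer the input channel
 * whenever its strength nibble is non-zero:
 *
 *   out[i] = (input[i] & 0xf0) ? input[i] : state[i];
 */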
static const_inline u8x16
select_nonzero_input (u8x16 state,
u8x16 input)
{
return vbslq_u8(
vtstq_u8(
input,
vdupq_n_u8(0xff)
),
input,
state
);
}
static force_inline uint64_t
seq_expand (uint16_t seq)
{
uint16_t rot = seq << 12U | seq >> 4U;
uint64_t ret = 0;
uint64x2_t x = vreinterpretq_u64_u8(
vqtbl1q_u8(
vreinterpretq_u8_u16(
vld1q_lane_u16(
&rot,
vld1q_lane_u16(
&seq,
vdupq_n_u16(0),
0
),
1
)
),
vld1q_u8(((uint8_t[16]){
0x00, 0x00, 0x00, 0x00,
0x02, 0x02, 0x02, 0x02,
0x01, 0x01, 0x01, 0x01,
0x03, 0x03, 0x03, 0x03})
)
)
);
x = vorrq_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x003c000f003c000f))
),
vshrq_n_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x78001e0078001e00))
),
5
)
);
x = vorrq_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x000000ff000000ff))
),
vshrq_n_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x03fc000003fc0000))
),
10
)
);
vst1q_lane_u64(
&ret,
vreinterpretq_u64_u8(
vqtbl1q_u8(
vreinterpretq_u8_u64(x),
vld1q_u8(((uint8_t[16]){
0x00, 0x01, 0x04, 0x05,
0x08, 0x09, 0x0c, 0x0d,
0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80})
)
)
),
0
);
return ret;
}
static force_inline u8x16
seq_expand_u8x16 (uint16_t seq)
{
uint64x2_t x = vreinterpretq_u64_u8(
vqtbl1q_u8(
vreinterpretq_u8_u16(
vld1q_lane_u16(
(uint16_t[1]){
seq << 12U |
seq >> 4U
},
vld1q_lane_u16(
&seq,
vdupq_n_u16(0),
0
),
1
)
),
vld1q_u8(((uint8_t[16]){
0x00, 0x00, 0x00, 0x00,
0x02, 0x02, 0x02, 0x02,
0x01, 0x01, 0x01, 0x01,
0x03, 0x03, 0x03, 0x03})
)
)
);
x = vorrq_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x003c000f003c000f))
),
vshrq_n_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x78001e0078001e00))
),
5
)
);
x = vorrq_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x000000ff000000ff))
),
vshrq_n_u64(
vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x03fc000003fc0000))
),
10
)
);
x = vreinterpretq_u64_u8(
vqtbl1q_u8(
vreinterpretq_u8_u64(x),
vld1q_u8(((uint8_t[16]){
0x00, 0x80, 0x01, 0x80,
0x04, 0x80, 0x05, 0x80,
0x08, 0x80, 0x09, 0x80,
0x0c, 0x80, 0x0d, 0x80})
)
)
);
uint64x2_t y = vandq_u64(
x,
vdupq_n_u64(UINT64_C(0x00f000f000f000f0))
);
return vreinterpretq_u8_u64(
vorrq_u64(
veorq_u64(x, y),
vshlq_n_u64(y, 4)
)
);
}
static force_inline vec128
seq_expand_vec128 (uint16_t seq)
{
return vec128_from_u8x16(seq_expand_u8x16(seq));
}
#endif /* __aarch64__ */
#if defined(__x86_64__) || defined(_MSC_VER)
# define pext_mask 0x783c1e0f783c1e0fULL
# define pdep_mask 0x0f0f0f0f0f0f0f0fULL
static force_inline uint64_t
seq_expand (uint16_t seq)
{
u8x16 k = _mm_shuffle_epi8(
_mm_set_epi64x(
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq
),
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL)
);
return _pext_u64((uint64_t)_mm_extract_epi64(k, 1), pext_mask) << 32U
| _pext_u64((uint64_t)_mm_extract_epi64(k, 0), pext_mask);
}
static force_inline vec128
seq_expand_vec128 (uint16_t seq)
{
u8x16 k = _mm_shuffle_epi8(
_mm_set_epi64x(
0, (uint32_t)(seq << 12U | seq >> 4U) << 16U | seq
),
_mm_set_epi64x(0x0303030301010101LL, 0x0202020200000000LL)
);
return (vec128){
.u64[0] = _pdep_u64(
_pext_u64(
(unsigned long long)_mm_extract_epi64(k, 0),
pext_mask
),
pdep_mask
),
.u64[1] = _pdep_u64(
_pext_u64(
(unsigned long long)_mm_extract_epi64(k, 1),
pext_mask
),
pdep_mask
)
};
}
static force_inline u8x16
seq_expand_u8x16 (uint16_t seq)
{
return u8x16_from_vec128(seq_expand_vec128(seq));
}
# undef pdep_mask
# undef pext_mask
static const_inline u8x16
mutate (u8x16 state,
u8x16 paths)
{
return _mm_shuffle_epi8(state, paths);
}
static const_inline uint16_t
sum_of_abs_diff (u8x16 a,
u8x16 b)
{
__m128i x = _mm_sad_epu8(a, b);
return _mm_extract_epi16(x, 0)
+ _mm_extract_epi16(x, 4);
}
static const_inline u8x16
age_and_mutate (u8x16 state,
u8x16 paths)
{
return _mm_shuffle_epi8(
_mm_subs_epu8(
state,
_mm_set_epi64x(
0x1010101010101010LL,
0x1010101010101010LL
)
),
paths
);
}
static const_inline u8x16
merge_input (u8x16 state,
u8x16 input)
{
return _mm_blendv_epi8(
state,
input,
/* Mask construction: adding 0x70 (saturated)
* sets the top bit iff the value is at least
* 0x10, and the result can be used as a mask
* in `_mm_blendv_epi8()`.
*/
_mm_adds_epu8(
input,
_mm_set_epi64x(
0x7070707070707070LL,
0x7070707070707070LL
)
)
);
}
static const_inline u8x16
select_nonzero_input (u8x16 state,
u8x16 input)
{
return _mm_blendv_epi8(
input,
state,
_mm_cmpeq_epi8(
input,
_mm_set_epi64x(0LL, 0LL)
)
);
}
#endif /* __x86_64__ || _MSC_VER */
static const_inline uint16_t
mutation_sad (u8x16 state,
u8x16 paths)
{
return sum_of_abs_diff(
state,
mutate(state, paths)
);
}
#endif /* B24_SEQ_H_ */
/**
* @file test-cortex.c
*
* Compiling with MSVC 19.41 or later:
*
* cl.exe /TC /std:clatest /O2 /Oi /GL /GF /Zo- /favor:AMD64 /arch:AVX2 cortex.c test-cortex.c /Fe: test-cortex.exe /MD /link ntdll.lib
*/
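/*
 * A plausible non-MSVC invocation (untested sketch; ntdll.lib is only
 * needed for the MSVC-only NtDelayExecution path):
 *
 *   cc -std=c2x -O2 -mavx2 -mbmi2 cortex.c test-cortex.c -o test-cortex
 */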
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cortex.h"
int
main (void)
{
char screen[69897];
char *s = screen;
struct cortex *ctx = cortex_create();
*s++ = '\033';
*s++ = '[';
*s++ = '2';
*s++ = 'J';
for (uint64_t k = 0U;; ++k) {
*s++ = '\033';
*s++ = '[';
*s++ = 'H';
for (uint16_t row = 0U; row < 128U; row += 2U) {
uint8_t r = 255U;
for (uint16_t col = 0U; col < 128U; col += 2U) {
struct cortex_loc loc =
linear_loc_fwd(
(struct linear_loc){
.row = row,
.col = col
}
);
uint8_t b = ctx->data[0].v[loc.seq << 4U | loc.rot]
.u8[loc.chn ];
uint8_t p = b >> 4U;
uint8_t q = b & 15U;
if (p != r) {
r = p;
*s++ = '\033';
*s++ = '[';
*s++ = '3';
*s++ = '8';
*s++ = ';';
*s++ = '5';
*s++ = ';';
p = 232U + ((p & 8U)
? -((16U - p) * 23U - 3U)
: p);
if (p >= 10U) {
if (p >= 100U) {
*s++ = (char)(p / 100U + (uint8_t)'0');
p %= 100U;
}
*s++ = (char)(p / 10U + (uint8_t)'0');
p %= 10U;
}
*s++ = (char)(p + (uint8_t)'0');
*s++ = 'm';
}
*s++ = "0123456789abcdef"[p];
*s++ = "0123456789abcdef"[q];
}
*s++ = '\n';
}
*s++ = '\0';
s = screen;
printf("%s\033[m%128" PRIu64 "\n", s, k);
cortex_tick(ctx);
//hol_up(100000000);
}
cortex_destroy(&ctx);
}
/** @file util.h
*/
#ifndef B24_UTIL_H_
#define B24_UTIL_H_
#ifdef _MSC_VER
# define WIN32_LEAN_AND_MEAN
# include <intrin.h>
# include <windows.h>
# include <sysinfoapi.h>
#else
# if defined(__aarch64__)
# include <arm_neon.h>
# elif defined(__x86_64__)
# include <immintrin.h>
# endif
#endif
#include <stddef.h>
#include <stdint.h>
#ifdef __aarch64__
typedef uint8x16_t u8x16;
#else
typedef __m128i u8x16;
#endif
#ifndef __has_builtin
# define __has_builtin(...) 0
#endif
#ifdef _MSC_VER
# define __attribute__(...)
# define force_inline __forceinline
# define pragma_msvc(...) _Pragma(#__VA_ARGS__)
# define typeof __typeof__
#else
# define force_inline __attribute__((always_inline)) inline
# define pragma_msvc(...)
#endif
#define array_size(a) (sizeof (a) / sizeof (a)[0])
#define const_function __attribute__((const))
#define const_inline const_function force_inline
#define pure_inline __attribute__((pure)) force_inline
/**
* @brief Macros for doing Morton encoding stuff.
*
* We employ Morton encoding to map linear coordinates (e.g. array offsets
* and bit indices) onto a quad tree space. This allows us to think of the
* layer topology as recursive nested quadrants, which is a more intuitive
* way to be confused.
*
* What happens in practice is that we take the upper and lower halves of
* an index and interleave them to form a single Morton encoded address.
* `fwd()` converts a linear address to a quad tree representation, `rev()`
* does the inverse, and `M()` computes the mask for a given bit width and
* interleaving pattern.
*
* Common parameters:
*
* @param T The integer width to use for computing and storing the Morton
* encoded value (usually 32 or 64).
* @param W The number of value bits to encode. Must be a multiple of 2.
* @param odd Whether to operate on the odd or even bits of the value.
* @param v The value to encode.
*/
#define M(T, W, odd) (((uint##T##_t)-1) / 3U << !!(odd) >> ((8U << ((W > 32U) + 2U)) - W))
#define fwd(T, W, v) (_pdep_u##T((v), M(T, W, 0)) | _pdep_u##T((v) >> W / 2U, M(T, W, 1)))
#define rev(T, W, v) (_pext_u##T((v), M(T, W, 0)) | _pext_u##T((v), M(T, W, 1)) << W / 2U)
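/* Worked example (illustration, assuming BMI2 or the ZP7 polyfill): with
 * T = 32 and W = 8, M(32, 8, 0) == 0x55 and M(32, 8, 1) == 0xaa, so
 * fwd() interleaves the low nibble into the even bit positions and the
 * high nibble into the odd ones, and rev() undoes it:
 *
 *   fwd(32, 8, 0x0f) == 0x55, fwd(32, 8, 0xf0) == 0xaa,
 *   rev(32, 8, 0x55) == 0x0f
 */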
#define container_of(ptr, type, member) \
((type *)((char *)(1 ? (ptr) : &((type *)0)->member) - offsetof(type, member)))
#endif /* B24_UTIL_H_ */
/** @file vec.h
*/
#ifndef B24_VEC_H_
#define B24_VEC_H_
#include <stdint.h>
#include "util.h"
typedef struct vec4x2 {
uint8_t x : 4; uint8_t y : 4;
} vec4x2;
typedef struct vec4x4 {
uint16_t x0 : 4; uint16_t y0 : 4;
uint16_t x1 : 4; uint16_t y1 : 4;
} vec4x4;
typedef struct vec4x8 {
uint32_t x0 : 4; uint32_t y0 : 4;
uint32_t x1 : 4; uint32_t y1 : 4;
uint32_t x2 : 4; uint32_t y2 : 4;
uint32_t x3 : 4; uint32_t y3 : 4;
} vec4x8;
typedef struct vec4x16 {
uint64_t x0 : 4; uint64_t y0 : 4;
uint64_t x1 : 4; uint64_t y1 : 4;
uint64_t x2 : 4; uint64_t y2 : 4;
uint64_t x3 : 4; uint64_t y3 : 4;
uint64_t x4 : 4; uint64_t y4 : 4;
uint64_t x5 : 4; uint64_t y5 : 4;
uint64_t x6 : 4; uint64_t y6 : 4;
uint64_t x7 : 4; uint64_t y7 : 4;
} vec4x16;
typedef struct vec128 {
union {
uint8_t u8[ 16];
vec4x2 u4x2[16];
uint16_t u16[ 8];
vec4x4 u4x4[ 8];
uint32_t u32[ 4];
vec4x8 u4x8[ 4];
uint64_t u64[ 2];
vec4x16 u4x16[2];
};
} vec128;
_Static_assert(sizeof(vec4x2 ) == 1U,"");
_Static_assert(sizeof(vec4x4 ) == 2U,"");
_Static_assert(sizeof(vec4x8 ) == 4U,"");
_Static_assert(sizeof(vec4x16) == 8U,"");
_Static_assert(sizeof(vec128 ) == 16U,"");
struct chunk {
vec128 v[256];
};
#ifdef __aarch64__
static pure_inline u8x16
u8x16_from_vec128 (vec128 v)
{
return vld1q_u8(&v.u8[0]);
}
static pure_inline vec128
vec128_from_u8x16 (u8x16 v)
{
vec128 ret;
vst1q_u8(&ret.u8[0], v);
return ret;
}
#else
static pure_inline u8x16
u8x16_from_vec128 (vec128 v)
{
return _mm_set_epi64x(
(long long)v.u64[1],
(long long)v.u64[0]
);
}
static pure_inline vec128
vec128_from_u8x16 (u8x16 v)
{
return (vec128){
.u64[0] = (uint64_t)_mm_extract_epi64(v, 0),
.u64[1] = (uint64_t)_mm_extract_epi64(v, 1),
};
}
#endif
#endif /* B24_VEC_H_ */
// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
//
// Copyright (c) 2020 Zach Wegner
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "popcnt.h"
// ZP7: branchless PEXT/PDEP replacement code for non-Intel processors
//
// The PEXT/PDEP instructions are pretty cool, with various (usually arcane)
// uses, behaving like bitwise gather/scatter instructions. They were introduced
// by Intel with the BMI2 instructions on Haswell.
//
// AMD processors implement these instructions, but very slowly. PEXT/PDEP can
// take from 18 to ~300 cycles, depending on the input mask. See this table:
// https://mobile.twitter.com/InstLatX64/status/1209095219087585281
// Other processors don't have PEXT/PDEP at all. This code is a polyfill for
// these processors. It's much slower than the raw instructions on Intel chips
// (which have a 3-cycle latency and 1-per-cycle throughput), but should be faster than AMD's implementation.
//
// Description of the algorithm
// ====
//
// This code uses a "parallel prefix popcount" technique (hereafter PPP for
// brevity). What this means is that we determine, for every bit in the input
// mask, how many bits below it are set. Or rather, aren't set--we need to get
// a count of how many bits each input bit should be shifted to get to its final
// position, that is, the difference between the bit-index of its destination
// and its original bit-index. This is the same as the count of unset bits in
// the mask below each input bit.
//
// The dumb way to get this PPP would be to create a 64-element array in a loop,
// but we want to do this in a bit-parallel fashion. So we store the counts
// "vertically" across six 64-bit values: one 64-bit value holds bit 0 of each
// of the 64 counts, another holds bit 1, etc. We can compute these counts
// fairly easily using a parallel prefix XOR (XOR is equivalent to a 1-bit
// adder that wraps around and ignores carrying). Using parallel prefix XOR as
// a 1-bit adder, we can build an n-bit adder by shifting the result left by
// one and ANDing with the input bits: this computes the carry by seeing where
// an input bit causes the 1-bit sum to overflow from 1 to 0. The shift left
// is needed anyways, because we want the PPP values to represent population
// counts *below* each bit, not including the bit itself.
//
// For processors with the CLMUL instructions (most x86 CPUs since ~2010), we
// can do the parallel prefix XOR and left shift in one instruction, by
// doing a carry-less multiply by -2.
//
// Anyways, once we have these six 64-bit values of the PPP, we can use each
// PPP bit to shift input bits by a power of two. That is, input bits that are
// in the bit-0 PPP mask are shifted by 2**0==1, bits in the bit-1 mask get
// shifted by 2, and so on, for shifts by 4, 8, 16, and 32 bits. Out of these
// six shifts, any shift value between 0 and 63 can be composed.
//
// For PEXT, we have to perform each shift in increasing order (1, 2, ...32) so
// that input bits don't overlap in the intermediate results. PDEP is the
// opposite: the 32-bit shift needs to happen first to make room for the smaller
// shifts. There's also a small complication for PDEP in that the PPP values
// line up where the input bits *end*, rather than where the input bits start
// like for PEXT. This means each bit mask needs to be shifted backwards before
// ANDing with the input bits.
//
// For both PEXT/PDEP the input bits need to be pre-masked so that only the
// relevant bits are being shifted around. For PEXT, this is a simple AND
// (input &= mask), but for PDEP we have to mask out everything but the low N
// bits, where N is the population count of the mask.
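// Worked example (illustration only): PEXT gathers the value bits selected
// by the mask into the low bits, and PDEP scatters them back out:
//
//   _pext_u64(0xabcd, 0x0ff0) == 0xbc
//   _pdep_u64(0xbc,   0x0ff0) == 0x0bc0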
#define N_BITS (6)
typedef struct {
uint64_t mask;
uint64_t ppp_bit[N_BITS];
} zp7_masks_64_t;
#if !(defined(__aarch64__) && defined(__ARM_FEATURE_AES)) && \
!((defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__))
// If we don't have access to the CLMUL instruction, emulate it with
// shifts and XORs
# define prefix_sum(x) ({ \
typeof(_Generic(x, uint32_t:(uint32_t)0, uint64_t:(uint64_t)0)) y = x; \
y ^= y << 1U; y ^= y << 2U; y ^= y << 4U; y ^= y << 8U; y ^= y << 16U; \
_Generic(y, uint32_t:y, uint64_t:y ^ y << 32U); })
#endif
// Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
// It can also be called separately and cached, if the mask values will be used
// more than once (these can be shared across PEXT and PDEP calls if they use
// the same masks).
const_function
static zp7_masks_64_t zp7_ppp_64(uint64_t mask) {
zp7_masks_64_t r;
r.mask = mask;
// Count *unset* bits
mask = ~mask;
#if defined(__aarch64__) && defined(__ARM_FEATURE_AES)
uint64x2_t m = vdupq_n_u64(mask);
uint64x2_t neg_2 = vdupq_n_u64(-2LL);
for (int i = 0; i < N_BITS - 1; i++) {
uint64x2_t bit = vreinterpretq_u64_p128(vmull_p64(
vgetq_lane_u64(m, 0), vgetq_lane_u64(neg_2, 0)));
r.ppp_bit[i] = vgetq_lane_u64(bit, 0);
m = vandq_u64(m, bit);
}
r.ppp_bit[N_BITS - 1] = -vgetq_lane_u64(m, 0) << 1;
#elif (defined(__i386__) || defined(__x86_64__)) && defined(__PCLMUL__)
// Move the mask and -2 to XMM registers for CLMUL
__m128i m = _mm_cvtsi64_si128(mask);
__m128i neg_2 = _mm_cvtsi64_si128(-2LL);
for (int i = 0; i < N_BITS - 1; i++) {
// Do a 1-bit parallel prefix popcount, shifted left by 1,
// in one carry-less multiply by -2.
__m128i bit = _mm_clmulepi64_si128(m, neg_2, 0);
r.ppp_bit[i] = _mm_cvtsi128_si64(bit);
// Get the carry bit of the 1-bit parallel prefix popcount. On
// the next iteration, we will sum this bit to get the next mask
m = _mm_and_si128(m, bit);
}
// For the last iteration, we can use a regular multiply by -2 instead of a
// carry-less one (or rather, a strength reduction of that, with
// neg/add/etc), since there can't be any carries anyways. That is because
// the last value of m (which has one bit set for every 32nd unset mask bit)
// has at most two bits set in it, when mask is zero and thus there are 64
// bits set in ~mask. If two bits are set, one of them is the top bit, which
// gets shifted out, since we're counting bits below each mask bit.
r.ppp_bit[N_BITS - 1] = -_mm_cvtsi128_si64(m) << 1;
#else
for (int i = 0; i < N_BITS - 1; i++) {
// Do a 1-bit parallel prefix popcount, shifted left by 1
uint64_t bit = prefix_sum(mask << 1);
r.ppp_bit[i] = bit;
// Get the carry bit of the 1-bit parallel prefix popcount. On
// the next iteration, we will sum this bit to get the next mask
mask &= bit;
}
// The last iteration won't carry, so just use neg/shift. See the CLMUL
// case above for justification.
r.ppp_bit[N_BITS - 1] = -mask << 1;
#endif
return r;
}
// PEXT
static force_inline uint64_t
zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
// Mask only the bits that are set in the input mask. Otherwise they collide
// with input bits and screw everything up
a &= masks->mask;
// For each bit in the PPP, shift right only those bits that are set in
// that bit's mask
for (int i = 0; i < N_BITS; i++) {
uint64_t shift = 1 << i;
uint64_t bit = masks->ppp_bit[i];
// Shift only the input bits that are set in this bit's mask
a = (a & ~bit) | ((a & bit) >> shift);
}
return a;
}
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) asm("_pext_u64");
uint64_t zp7_pext_64(uint64_t a, uint64_t mask) {
zp7_masks_64_t masks = zp7_ppp_64(mask);
return zp7_pext_pre_64(a, &masks);
}
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) asm("_pext_u32");
uint32_t zp7_pext_32(uint32_t a, uint32_t mask) {
return (uint32_t)zp7_pext_64(a, mask);
}
// PDEP
static force_inline uint64_t
zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
uint64_t nbits = popcnt(masks->mask);
// Mask just the bits that will end up in the final result--the low P bits,
// where P is the popcount of the mask. The other bits would collide.
// We need special handling for the mask==-1 case: because 64-bit shifts are
// implicitly modulo 64 on x86 (and (uint64_t)1 << 64 is technically
// undefined behavior in C), the regular "a &= (1 << pop) - 1" doesn't
// work: (1 << popcnt(-1)) - 1 == (1 << 64) - 1 == (1 << 0) - 1 == 0, but
// this should be -1. The BZHI instruction (introduced with BMI2, the same
// instructions as PEXT/PDEP) handles this properly, but isn't portable.
#if (defined(__i386__) || defined(__x86_64__)) && defined(__BMI2__)
a = _bzhi_u64(a, nbits);
#else
// If we don't have BZHI, use a portable workaround. Since (mask == -1)
// is equivalent to popcnt(mask) >> 6, use that to mask out the 1 << 64
// case.
uint64_t pop_mask = (1ULL << nbits) & ~(nbits >> 6);
a &= pop_mask - 1;
#endif
// For each bit in the PPP, shift left only those bits that are set in
// that bit's mask. We do this in the opposite order compared to PEXT
for (int i = N_BITS - 1; i >= 0; i--) {
uint64_t shift = 1 << i;
uint64_t bit = masks->ppp_bit[i] >> shift;
// Micro-optimization: the bits that get shifted and those that don't
// will always be disjoint, so we can add them instead of ORing them.
// The shifts of 1 and 2 can thus merge with the adds to become LEAs.
a = (a & ~bit) + ((a & bit) << shift);
}
return a;
}
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) asm("_pdep_u64");
uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) {
zp7_masks_64_t masks = zp7_ppp_64(mask);
return zp7_pdep_pre_64(a, &masks);
}
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) asm("_pdep_u32");
uint32_t zp7_pdep_32(uint32_t a, uint32_t mask) {
return (uint32_t)zp7_pdep_64(a, mask);
}
#ifndef ZP7_H_
#define ZP7_H_
#ifndef __cplusplus
#include <stdint.h>
#define STD
#else // __cplusplus
#include <cstdint>
#define STD ::std::
extern "C" {
#endif // __cplusplus
extern STD uint32_t _pext_u32(STD uint32_t a, STD uint32_t mask);
extern STD uint64_t _pext_u64(STD uint64_t a, STD uint64_t mask);
extern STD uint32_t _pdep_u32(STD uint32_t a, STD uint32_t mask);
extern STD uint64_t _pdep_u64(STD uint64_t a, STD uint64_t mask);
#undef STD
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // ZP7_H_