Created
December 6, 2024 14:38
-
-
Save Marc-B-Reynolds/45a1742df711a1046981b8221311d1b4 to your computer and use it in GitHub Desktop.
Skims two methods: bisection method for random numbers with given popcount & branchfree random bit permutation with bit-scatter or sheep-and-goats op.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// UNLICENSE or whatever other you want that doesn't hold me | |
// responsible for you choosing to use any of this. | |
// a sketch of a few niche things: | |
// 1) generate a random number with a given population count | |
// 2) perform a random bit permutation of the input | |
// 3) not quite random versions of (2) | |
// 4) use (2) or (3) to produce stateful variants of (1) | |
#include <stdint.h> | |
static inline uint32_t pop_64(uint64_t x) { return (uint32_t)__builtin_popcountl(x); } | |
//************************************************************************** | |
// intel specific defs | |
#include <x86intrin.h> | |
static inline uint64_t bit_scatter_64(uint64_t x, uint64_t m) { return _pdep_u64(x, m); } | |
// just for minimal test driver at the end | |
static inline uint32_t ctz_64(uint64_t x) { return (uint32_t)_tzcnt_u64(x); } | |
static inline uint64_t bit_gzip_64(uint64_t v, uint64_t m) | |
{ | |
uint32_t p = pop_64(m) & 0x3f; | |
uint64_t a = bit_scatter_64(v, m); | |
uint64_t b = bit_scatter_64(v>>p,~m); | |
return a^b; | |
} | |
//************************************************************************** | |
// mini PRNG (a unparameterized PCG variant) | |
typedef struct { uint64_t state; } prng_t; | |
static inline uint64_t prng_mix_64(uint64_t x) | |
{ | |
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9; | |
x = (x ^ (x >> 27)) * 0x94d049bb133111eb; | |
x = (x ^ (x >> 31)); | |
return x; | |
} | |
static const uint64_t prng_mul_k = UINT64_C(0xd1342543de82ef95); | |
static const uint64_t prng_add_k = UINT64_C(0x2545f4914f6cdd1d); | |
static inline uint64_t prng_u64(prng_t* prng) | |
{ | |
uint64_t s = prng->state; | |
uint64_t r = prng_mix_64(s); | |
prng->state = prng_mul_k * s + prng_add_k; | |
return r; | |
} | |
//************************************************************************** | |
// minimal implementation of using bisection to produce a random number | |
// with a specified population count. As written is "portable" but it's | |
// nice to have hardware population count. | |
// | |
// The method works by using the bisection method. We track an | |
// lower and upper bounds {min,max} where pop(min) < target < pop(max) | |
// at each step a candidate 'x' is produce that contains all | |
// the set bits of 'min' and none of the clear bits of 'max'. | |
// then we compute the population count of 'x' and update the | |
// appropriate bound and loop until we've hit the target. | |
// | |
// This minimal implementation (on average) performs log2(b) steps | |
// (where b is the bit width so 6). Most of the convergence happens | |
// in the first few steps. As-is it isn't too bad. The std-dev is | |
// about ~2 so most calls will complete with ~4-8 iterations but it | |
// can have a long tail. An empirical test with ~2^26 samples is | |
// see iteration counts as high as the mid 30s. (sadface). | |
// First order statistics for this test are in this gist: | |
// https://gist.github.com/Marc-B-Reynolds/57509af21e42bc7ec9d2da5f7e55f02e | |
// | |
// There's some low-hanging fruit here. Noted below is changing the | |
// bound update to be branchfree. Another is initializing min/max | |
// out of the loop. The loop could be restructured, etc. etc. But | |
// I've never thought of a "portable" way to handle reducing the | |
// tail. One method could be to exit the loop when within a window | |
// of the target and uniformly set or clear bits but the only fast | |
// way that's occurred to me is using bit-scatter ops which is the | |
// core of the alternate method below. | |
// | |
// I'm ignoring the fact that specialized versions could be created | |
// when the target popcount is either low or high. beyond scope. | |
// (aside these are the same since we can BITNOT the output to flip | |
// between the two) | |
static inline uint64_t bit_random_pop(uint32_t pop, prng_t* prng) | |
{ | |
uint64_t min = 0; | |
uint64_t max = ~min; | |
uint32_t n = 0; | |
uint64_t x; | |
while (n != pop) { | |
x = prng_u64(prng); | |
x = min | (x & max); | |
n = pop_64(x); | |
// update the changed bound. branch is random | |
// so "should" be moved to brachfree select. | |
if (n > pop) | |
max = x; | |
else | |
min = x; | |
} | |
return min; | |
} | |
//************************************************************************** | |
// branch-free methods (but requires specific hardware ops) | |
// simply one random gzip or sheep-and-goats | |
// note that gzip isn't sheep-and-goats the selector 'm' becomes '~m' | |
// to flip between the two. since we don't need to produce the same | |
// sequence of results on different hardware (say some future hardware | |
// has a direct SAG op) don't need to perform that next step. | |
static inline uint64_t bit_random_permute_step_64(uint64_t x, prng_t* prng) | |
{ | |
return bit_gzip_64(x, prng_u64(prng)); | |
} | |
// any 'b' bit permutation can be produced by log2(b) sheep-and-goats | |
// steps. flip to making the shuffle random to produce a random | |
// bit permutation of the input. | |
static inline uint64_t bit_random_permute_64(uint64_t x, prng_t* prng) | |
{ | |
// log2(64) = 6 branchfree steps (the average of bisection method) | |
x = bit_random_permute_step_64(x, prng); | |
x = bit_random_permute_step_64(x, prng); | |
x = bit_random_permute_step_64(x, prng); | |
x = bit_random_permute_step_64(x, prng); | |
x = bit_random_permute_step_64(x, prng); | |
x = bit_random_permute_step_64(x, prng); | |
return x; | |
} | |
// a drop-in replacement for the bisection method (assuming producing | |
// the same sequence on different hardware isn't required). | |
// simply: | |
// 1) set 'x' to having some 'pop' bits set (lower here) | |
// 2) apply the random bit permutation | |
static inline uint64_t bit_random_pop_alt(uint32_t pop, prng_t* prng) | |
{ | |
// not bothering with pop = 0 case because that's goofy | |
uint64_t x = UINT64_MAX >> ((64-pop) & 63); | |
return bit_random_permute_64(x, prng); | |
} | |
// by adding a 'state' we can walk a sequence of x values | |
// 1) initialize 'x' to desired popcount | |
// 2) call with 'x' to walk the sequence | |
static inline uint64_t bit_random_pop_next(uint64_t x, prng_t* prng) | |
{ | |
return bit_random_permute_64(x, prng); | |
} | |
// finally just the previous but we can drop some of the steps. | |
// This makes the sequence "not random" in that successive elements | |
// aren't complete independent from one another. However they do | |
// (as a sequence) map all bits uniformly to all output bit | |
// positions. The more steps includes increases the independece. | |
static inline uint64_t bit_i_cant_believe_its_not_random_pop_next(uint64_t x, prng_t* prng) | |
{ | |
// one step is sufficient to be uniform over the sequence. | |
// after four steps we'd start passing a fair amount of | |
// statistical tests of independence. | |
x = bit_random_permute_step_64(x, prng); | |
//x = bit_random_permute_step_64(x, prng); | |
//x = bit_random_permute_step_64(x, prng); | |
//x = bit_random_permute_step_64(x, prng); | |
//x = bit_random_permute_step_64(x, prng); | |
//x = bit_random_permute_step_64(x, prng); | |
return x; | |
} | |
//************************************************************************** | |
// minimal demo of uniformity of output bit positions being visited by | |
// an single bit value. | |
#include <stdio.h> | |
#include <math.h> | |
// choose bit permutation method | |
#define STEP bit_i_cant_believe_its_not_random_pop_next | |
int main(void) | |
{ | |
prng_t prng; | |
uint32_t data[64] = {0}; | |
uint64_t x = 1; // any single bit value | |
uint32_t trials = 0x100000; | |
// randomize run | |
prng.state = __rdtsc()*UINT64_C(0x7fb5d329728ea185); | |
prng_u64(&prng); | |
prng_u64(&prng); | |
// just look at unformity of output positions | |
for(uint32_t i=0; i<trials; i++) { | |
x = bit_random_permute_64(x, &prng); | |
data[ctz_64(x)]++; | |
} | |
double expected = (double)trials * (1.0/64.0); | |
double chi = 0.0; | |
for(uint32_t i=0; i<64; i++) { | |
double d = (double)data[i]-expected; | |
chi = fma(d,d,chi); | |
} | |
chi /= expected; | |
//double pvalue = chi_squared_pvalue(chi,63); | |
printf("expected: %f\n", expected); | |
printf("chi-squared: %f\n", chi); | |
//printf("p-value: %f\n", pvalue); | |
// dump raw data | |
printf("{%u", data[0]); | |
for(uint32_t i=1; i<64; i++) { | |
printf(",%u",data[i]); | |
} | |
printf("}\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment