Skip to content

Instantly share code, notes, and snippets.

@Marc-B-Reynolds
Created December 6, 2024 14:38
Show Gist options
  • Save Marc-B-Reynolds/45a1742df711a1046981b8221311d1b4 to your computer and use it in GitHub Desktop.
Save Marc-B-Reynolds/45a1742df711a1046981b8221311d1b4 to your computer and use it in GitHub Desktop.
Skims two methods: bisection method for random numbers with given popcount & branchfree random bit permutation with bit-scatter or sheep-and-goats op.
// UNLICENSE or whatever other you want that doesn't hold me
// responsible for you choosing to use any of this.
// a sketch of a few niche things:
// 1) generate a random number with a given population count
// 2) perform a random bit permutation of the input
// 3) not quite random versions of (2)
// 4) use (2) or (3) to produce stateful variants of (1)
#include <stdint.h>
static inline uint32_t pop_64(uint64_t x) { return (uint32_t)__builtin_popcountl(x); }
//**************************************************************************
// intel specific defs
#include <x86intrin.h>
static inline uint64_t bit_scatter_64(uint64_t x, uint64_t m) { return _pdep_u64(x, m); }
// just for minimal test driver at the end
static inline uint32_t ctz_64(uint64_t x) { return (uint32_t)_tzcnt_u64(x); }
static inline uint64_t bit_gzip_64(uint64_t v, uint64_t m)
{
uint32_t p = pop_64(m) & 0x3f;
uint64_t a = bit_scatter_64(v, m);
uint64_t b = bit_scatter_64(v>>p,~m);
return a^b;
}
//**************************************************************************
// mini PRNG (a unparameterized PCG variant)
typedef struct { uint64_t state; } prng_t;
static inline uint64_t prng_mix_64(uint64_t x)
{
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9;
x = (x ^ (x >> 27)) * 0x94d049bb133111eb;
x = (x ^ (x >> 31));
return x;
}
static const uint64_t prng_mul_k = UINT64_C(0xd1342543de82ef95);
static const uint64_t prng_add_k = UINT64_C(0x2545f4914f6cdd1d);
static inline uint64_t prng_u64(prng_t* prng)
{
uint64_t s = prng->state;
uint64_t r = prng_mix_64(s);
prng->state = prng_mul_k * s + prng_add_k;
return r;
}
//**************************************************************************
// minimal implementation of using bisection to produce a random number
// with a specified population count. As written is "portable" but it's
// nice to have hardware population count.
//
// The method works by using the bisection method. We track an
// lower and upper bounds {min,max} where pop(min) < target < pop(max)
// at each step a candidate 'x' is produce that contains all
// the set bits of 'min' and none of the clear bits of 'max'.
// then we compute the population count of 'x' and update the
// appropriate bound and loop until we've hit the target.
//
// This minimal implementation (on average) performs log2(b) steps
// (where b is the bit width so 6). Most of the convergence happens
// in the first few steps. As-is it isn't too bad. The std-dev is
// about ~2 so most calls will complete with ~4-8 iterations but it
// can have a long tail. An empirical test with ~2^26 samples is
// see iteration counts as high as the mid 30s. (sadface).
// First order statistics for this test are in this gist:
// https://gist.github.com/Marc-B-Reynolds/57509af21e42bc7ec9d2da5f7e55f02e
//
// There's some low-hanging fruit here. Noted below is changing the
// bound update to be branchfree. Another is initializing min/max
// out of the loop. The loop could be restructured, etc. etc. But
// I've never thought of a "portable" way to handle reducing the
// tail. One method could be to exit the loop when within a window
// of the target and uniformly set or clear bits but the only fast
// way that's occurred to me is using bit-scatter ops which is the
// core of the alternate method below.
//
// I'm ignoring the fact that specialized versions could be created
// when the target popcount is either low or high. beyond scope.
// (aside these are the same since we can BITNOT the output to flip
// between the two)
static inline uint64_t bit_random_pop(uint32_t pop, prng_t* prng)
{
uint64_t min = 0;
uint64_t max = ~min;
uint32_t n = 0;
uint64_t x;
while (n != pop) {
x = prng_u64(prng);
x = min | (x & max);
n = pop_64(x);
// update the changed bound. branch is random
// so "should" be moved to brachfree select.
if (n > pop)
max = x;
else
min = x;
}
return min;
}
//**************************************************************************
// branch-free methods (but requires specific hardware ops)
// simply one random gzip or sheep-and-goats
// note that gzip isn't sheep-and-goats the selector 'm' becomes '~m'
// to flip between the two. since we don't need to produce the same
// sequence of results on different hardware (say some future hardware
// has a direct SAG op) don't need to perform that next step.
static inline uint64_t bit_random_permute_step_64(uint64_t x, prng_t* prng)
{
return bit_gzip_64(x, prng_u64(prng));
}
// any 'b' bit permutation can be produced by log2(b) sheep-and-goats
// steps. flip to making the shuffle random to produce a random
// bit permutation of the input.
static inline uint64_t bit_random_permute_64(uint64_t x, prng_t* prng)
{
// log2(64) = 6 branchfree steps (the average of bisection method)
x = bit_random_permute_step_64(x, prng);
x = bit_random_permute_step_64(x, prng);
x = bit_random_permute_step_64(x, prng);
x = bit_random_permute_step_64(x, prng);
x = bit_random_permute_step_64(x, prng);
x = bit_random_permute_step_64(x, prng);
return x;
}
// a drop-in replacement for the bisection method (assuming producing
// the same sequence on different hardware isn't required).
// simply:
// 1) set 'x' to having some 'pop' bits set (lower here)
// 2) apply the random bit permutation
static inline uint64_t bit_random_pop_alt(uint32_t pop, prng_t* prng)
{
// not bothering with pop = 0 case because that's goofy
uint64_t x = UINT64_MAX >> ((64-pop) & 63);
return bit_random_permute_64(x, prng);
}
// by adding a 'state' we can walk a sequence of x values
// 1) initialize 'x' to desired popcount
// 2) call with 'x' to walk the sequence
static inline uint64_t bit_random_pop_next(uint64_t x, prng_t* prng)
{
return bit_random_permute_64(x, prng);
}
// finally just the previous but we can drop some of the steps.
// This makes the sequence "not random" in that successive elements
// aren't complete independent from one another. However they do
// (as a sequence) map all bits uniformly to all output bit
// positions. The more steps includes increases the independece.
static inline uint64_t bit_i_cant_believe_its_not_random_pop_next(uint64_t x, prng_t* prng)
{
// one step is sufficient to be uniform over the sequence.
// after four steps we'd start passing a fair amount of
// statistical tests of independence.
x = bit_random_permute_step_64(x, prng);
//x = bit_random_permute_step_64(x, prng);
//x = bit_random_permute_step_64(x, prng);
//x = bit_random_permute_step_64(x, prng);
//x = bit_random_permute_step_64(x, prng);
//x = bit_random_permute_step_64(x, prng);
return x;
}
//**************************************************************************
// minimal demo of uniformity of output bit positions being visited by
// an single bit value.
#include <stdio.h>
#include <math.h>
// choose bit permutation method
#define STEP bit_i_cant_believe_its_not_random_pop_next
int main(void)
{
prng_t prng;
uint32_t data[64] = {0};
uint64_t x = 1; // any single bit value
uint32_t trials = 0x100000;
// randomize run
prng.state = __rdtsc()*UINT64_C(0x7fb5d329728ea185);
prng_u64(&prng);
prng_u64(&prng);
// just look at unformity of output positions
for(uint32_t i=0; i<trials; i++) {
x = bit_random_permute_64(x, &prng);
data[ctz_64(x)]++;
}
double expected = (double)trials * (1.0/64.0);
double chi = 0.0;
for(uint32_t i=0; i<64; i++) {
double d = (double)data[i]-expected;
chi = fma(d,d,chi);
}
chi /= expected;
//double pvalue = chi_squared_pvalue(chi,63);
printf("expected: %f\n", expected);
printf("chi-squared: %f\n", chi);
//printf("p-value: %f\n", pvalue);
// dump raw data
printf("{%u", data[0]);
for(uint32_t i=1; i<64; i++) {
printf(",%u",data[i]);
}
printf("}\n");
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment