camel-cdr · February 1, 2025 21:06
diff --git a/dist_circle.h b/dist_circle.h
 /*
 * Using the Ziggurat Method for Sampling Random Coordinates From a Unit Circle
 * by Olaf Bernstein <[email protected]>.
 * Distributed under the MIT license, see license at the end of the file.
 * New versions available at https://github.com/camel-cdr/cauldron
 *
 * Table of contents ===========================================================
 *
 * 1. Introduction
 *     1.1 API Overview
 * 2. Ziggurat Method
 * 3. Benchmarking
 *     3.1 Alternative Implementations
 *     3.2 Benchmark
 *     3.3 Results
 * 4. Testing
 * References
 * Licensing
 *     MIT License
 *
 * 1. Introduction =============================================================
 *
 * This project was inspired by the article "When Greedy Algorithms Can Be
 * Faster" <1>, which covers an interesting problem:
 *
 * Generate a random coordinate that is uniformly distributed in a unit circle.
 *
 * An easy solution to this problem is to generate random coordinates in the
 * range [-1,1] and reject them if they don't fall in the unit circle:
 */

 #include <stdint.h>
 #include <math.h>
 #include <stddef.h>
 #include <string.h>

 typedef struct { float x, y; } Vec2f;
 static Vec2f
 dist_circlef(uint64_t (*rand64)(void*), void *rng)
 {
 	float fx, fy;
 	do {
 		uint64_t u64 = rand64(rng);
 		fx = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
 		fy = (u64 & 0xffffff)   * (1.0f / ((1u<<24)-1));
 		fx = fx*2 - 1; /* scale to [-1,1] */
 		fy = fy*2 - 1;
 	} while (fx*fx + fy*fy >= 1.0); /* reject */
 	Vec2f ret = { fx, fy };
 	return ret;
 }

 /*
 * We'll subsequently call this implementation the "regular rejection method".
 * It needs to do on average 27% more iterations than the number of times it is
 * called, which causes lots of branch miss predictions.
 *
 * In the following, we'll present an optimized scalar solution, based on the
 * Ziggurat Method <2>.
 *
 * We won't cover SIMD optimized implementations. There the simple rejection
 * method combined with a compress instruction is likely the best approach.
 *
 * 1.1 API Overview ------------------------------------------------------------
 *
 * Random distributions:
 *     // random 2d coordinate from unit circle
 *     float dist_normalf(uint32_t (*)(void*), void *);
 *
 *     // a faster dist_normalf implementation using a lookup table
 *     void dist_circlef_zig_init(DistCirclefZig *);
 *     Vec2f dist_circlef_zig(DistCirclefZig *, uint64_t (*)(void*), void *);
 *
 *
 * 2. Ziggurat Method ==========================================================
 *
 * The Ziggurat Method <2> was originally introduced by George Marsaglia and
 * Wai Wan Tsang for generating normally distributed numbers <3>.
 *
 * It works by overlapping the normal distribution with boxes, that each
 * encapsulate the same area of the underlying distribution.
 * This can be used to quickly accept
 * coordinates using a small lookup table:     +----------------------------+
 *                                             |      exp(-(x*x)/2) ******  |
 * 1. A box is randomly selected and a         +****-------                 |
 * random x coordinate is generated within     |    ***   |                 |
 * that box, which reduces the number of       |      **  |                 |
 * total rejections.                           | Box 3 ***|                 |
 *                                             +---------***----            |
 * 2. If the x coordinate falls within the     |          :**  |            |
 * bounds of the box above the selected box,   | Box 2    : ***|            |
 * then it is directly accepted.               +--------------***----       |
 *                                             | Box 1         :*** |       |
 * 3. Otherwise, a concrete random y           +------------------****      |
 * coordinate within the box is generated      | Box 0              :****** |
 * and a high resolution rejection test is     +----------------------------+
 * applied to the coordinates.
 *
 *
 * We can apply a similar technique to our problem of sampling a 2d coordinate
 * from a unit circle.
 * For simplicity, we start off looking at only a quarter of the unit circle
 * that lies in the first quadrant of the coordinate system, and apply random
 * signs to the final coordinate later.
 * Now we can again subdivide the quarter circle in n boxes contain an equal
 * area A=pi/4/n of the circle.
 *
 * We can do the step 1. from above and select a random box, and generate a
 * random x, and this time also y coordinate within the box.
 * Since the "high resolution" rejection test is, in contrast to the normal,
 * distribution very cheap for our problem, we skip step 2. and do a quick
 * rejection test with x*x + y*y < 1.
 *
 * |```---...-----------------             |```----..
 * |         ``-..           |             |         `--.
 * |              ``-.       |             |            `--.
 * |    A1            `-.    |             |               ``.
 * |                     `.  |             |-----------------``.-----
 * |                       `.|             |                .-``.   ^
 * |-------------------------`-----        | A_trig      .-`    `.  |
 * |                          `.  |        |         .-``        `. | h
 * |    A2                      ` |        |      .-`     A_seg   : |
 * |                             `|        |  .-``  `.            : |
 * |------------------------------`.--     |.` alpha :            : v
 * |                               `.|     \-------------------------
 * |    A3                          :|
 * |________________________________-.
 * |                                 :|
 * |    A4                           :|  A1 = A2 = A3 = A4 = pi/4/n = pi/4/4
 * |                                 :|
 * \-----------------------------------
 *
 * The challenging part is computing the lookup table that contains the sizes of
 * our boxes.
 *
 * We need to calculate the height h under the quarter circle, given an area A.
 * This can be done by numerically solving the equation that calculates the
 * area A of the circle for a given height h of a box:
 *
 *     A = A_trig + A_seg
 *     A_trig = h*cos(alpha)/2
 *     A_seg = pi * r^2 * alpha/2/pi = alpha/2
 *     alpha = asin(h)
 *     A = A_trig + A_seg = h*cos(asin(h))/2 + asin(h)/2
 *
 * We know that h is in the range [0,1], so we can use a binary search to solve
 * the equation:
 */

 static double
 dist_circle_zig__solve(double area)
 {
 	double A, h = 0.5;
 	for (size_t i = 2; i < 64; ++i) {
 		A = h*cos(asin(h))/2 + asin(h)/2;
 		h += (A < area ? 1.0 : -1.0) / (1ull<<i);
 	}
 	return h;
 }

 /*
 * With this equation worked out, we can now initialize the lookup table.
 *
 * The regular rejection method needs to do on average of 27% more iterations,
 * while the ziggurat method with the default lookup table size of 128 entries,
 * we get down to 0.76% additional iterations.
 */

 #ifndef DIST_CIRCLEF_ZIG_COUNT
 # define DIST_CIRCLEF_ZIG_COUNT 128 /* must be a power of two and <=2^14 */
 #endif
 #define DIST_CIRCLEF_ZIG_AREA (0.7853981633974483/DIST_CIRCLEF_ZIG_COUNT) /* pi/4/n */

 typedef struct {
 	float tbl[DIST_CIRCLEF_ZIG_COUNT*2 + 2];
 } DistCirclefZig;

 static void
 dist_circlef_zig_init(DistCirclefZig *zig)
 {
 	for (size_t i = 0; i < DIST_CIRCLEF_ZIG_COUNT; ++i) {
 		double A = DIST_CIRCLEF_ZIG_AREA * i;
 		double h = dist_circle_zig__solve(A);
 		zig->tbl[i*2+0] = cos(asin(h));
 		zig->tbl[i*2+1] = h;
 	}
 	zig->tbl[DIST_CIRCLEF_ZIG_COUNT*2+0] = 0.0;
 	zig->tbl[DIST_CIRCLEF_ZIG_COUNT*2+1] = 1.0;
 }

 /*
 * Notice how tbl has on additional coordinate and uses a cumulative height.
 * This allows us to compute the height of a single box with
 * tbl[i*2+2]-tbl[i*2]. The vertical position is just tbl[i*2].
 *
 * Note that we don't use a Vec2f array for the lookup table, because the code
 * generation was considerably worse.
 */

 /* needed to efficiently apply sign */
 static float
 u32tof(uint32_t x)
 {
 #ifdef __cplusplus
 	float f; memcpy(&f, &x, sizeof f); return f;
 #else
 	union{ uint32_t i; float f; } u; u.i = x; return u.f;
 #endif
 }
 size_t cnt1 = 0, cnt2 = 0;

 static Vec2f
 dist_circlef_zig(DistCirclefZig const *zig, uint64_t (*rand64)(void*), void *rng)
 {
 	float fx, fy;
 	uint64_t signs;
 	++cnt1;
 	do {
 		++cnt2;
 		uint64_t u64 = rand64(rng);
 		fx = (u64       >> (64 - 24)) * (1.0f / ((1u<<24)-1));
 		fy = (u64 << 24 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
 		signs = u64 >> (64-24*2-2);
 		uint64_t idx = u64 & (DIST_CIRCLEF_ZIG_COUNT-1);

 		float vx = zig->tbl[idx*2+0];
 		float vy = zig->tbl[idx*2+1];
 		float ny = zig->tbl[idx*2+3];
 		fx *= vx; /* scale to box */
 		fy = fy * (ny - vy) + vy;
 	} while (fx*fx + fy*fy >= 1.0); /* reject */
 	/* there usually is a sign injection instruction
 	 * so this compiles down quite neat */
 	Vec2f ret = {
 		copysignf(fx, u32tof(signs<<31)),
 		copysignf(fy, u32tof(signs>>1<<31))
 	};
 	return ret;
 }


 /*
 * 3. Benchmarking =============================================================
 *
 * 3.1 Alternative Implementations ---------------------------------------------
 *
 * One straight forward way to sample a random coordinate from a unit circle is
 * by generating polar coordinates and converting them to Cartesian
 * coordinates.
 * This does however require computing the sine and cosine of an angle, which
 * is a quite expensive operation.
 */

 #ifdef DIST_CIRCLE_BENCH

 static Vec2f
 dist_circlef_trig(uint64_t (*rand64)(void*), void *rng)
 {
 	uint64_t u64 = rand64(rng);
 	float f1 = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
 	float f2 = (u64 & 0xffffff)   * (1.0f / ((1u<<24)-1));
 	float r = sqrtf(f1);
 	float theta = f2 * 6.283185307179586; /* f2*2*pi */
 	Vec2f ret = { r*cosf(theta), r*sinf(theta) };
 	return ret;
 }

 /*
 * The regular rejection method mentioned in the introduction suffers from a
 * lot of branch miss predictions, because rejections are quite common, with a
 * 21% probability.
 * If we want to generate multiple random coordinates at a time, then we can get
 * rid of the branch altogether and replace with a much less impactful
 * register dependency.
 */

 static void
 dist_circlef_rej_multi(uint64_t (*rand64)(void*), void *rng, Vec2f *ptr, size_t n)
 {
 	for (size_t i = 0; i < n; ) {
 		uint64_t u64 = rand64(rng);
 		float fx = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
 		float fy = (u64 & 0xffffff)   * (1.0f / ((1u<<24)-1));
 		fx = fx*2 - 1;
 		fy = fy*2 - 1;
 		ptr[i].x += fx; // Note: we accumulate to make sure it's not
 		ptr[i].y += fy; // optimized away, see benchmark code below
 		i += fx*fx + fy*fy < 1.0;
 	}
 }

 /*
 * 3.2 Benchmark ---------------------------------------------------------------
 *
 * We accumulate the results into an array and sum it, to ensure that they
 * don't get optimized away.
 */

 #define RANDOM_H_IMPLEMENTATION
 #include <cauldron/random.h>
 /* https://github.com/camel-cdr/cauldron/blob/main/cauldron/random.h */
 #define BENCH_DONT_NORMALIZE
 #include <cauldron/bench.h>
 /* https://github.com/camel-cdr/cauldron/blob/main/cauldron/bench.h */
 #include <stdio.h>
 #include <stdlib.h>

 #define N (1024*1024)
 #define WARMUP 16
 #define REPEAT 64

 int
 main(void)
 {
 	DistCirclefZig zig;
 	dist_circlef_zig_init(&zig);
 	PRNG64RomuQuad prng64;
 	prng64_romu_quad_randomize(&prng64);

 	Vec2f *arr = malloc(N * sizeof *arr);
 	memset(arr, 0, N * sizeof *arr);
 	BENCH("trig", WARMUP, REPEAT) {
 		for (size_t i = 0; i < N; ++i) {
 			Vec2f v = dist_circlef_trig(prng64_romu_quad, &prng64);
 			arr[i].x += v.x; arr[i].y += v.y;
 		}
 		BENCH_CLOBBER();
 	}
 	BENCH("rej", WARMUP, REPEAT) {
 		for (size_t i = 0; i < N; ++i) {
 			Vec2f v = dist_circlef(prng64_romu_quad, &prng64);
 			arr[i].x += v.x; arr[i].y += v.y;
 		}
 		BENCH_CLOBBER();
 	}
 	BENCH("zig", WARMUP, REPEAT) {
 		for (size_t i = 0; i < N; ++i) {
 			Vec2f v = dist_circlef_zig(&zig, prng64_romu_quad, &prng64);
 			arr[i].x += v.x; arr[i].y += v.y;
 		}
 		BENCH_CLOBBER();
 	}
 	BENCH("rej_multi", WARMUP, REPEAT) {
 		dist_circlef_rej_multi(prng64_romu_quad, &prng64, arr, N);
 		BENCH_CLOBBER();
 	}
 	double sum = 0;
 	for (size_t i = 0; i < N; ++i)
 		sum += arr[i].x - arr[i].y;
 	printf("sum: %f\n", sum);

 	printf("%zu, %zu\n", cnt1, cnt2);
 	bench_done();
 	return 0;
 }
 #endif /* DIST_CIRCLE_BENCH */


 /*
 * 3.3 Results -----------------------------------------------------------------
 *
 * Run on Zen1 1600x:
 *
 *     relative time           trig   rej    zig    rej_multi
 *     clang -O3               7.72   2.94   1.23   1.15
 *     gcc   -O3               6.84   2.95   1.32   1.54
 *     clang -O3 -ffast-math   7.25   2.71   1.45   1.00
 *     gcc   -O3 -ffast-math   6.36   2.72   1.24   1.28
 *
 * The ziggurat method is a clear improvement over both the simple rejection
 * and trigonometric method.
 * It's likely the best for latency sensitive and one off usage.
 * If you need to generate multiple random coordinates uniformly distributed in
 * a unit circle, then the regular rejection methods ends up faster, although
 * there you might as well consider a SIMD implementation.
 *
 * For SIMD I'd recommend using a fast SIMD PRNG like SHISHUA <4>, convert the
 * random bits to a uniform coordinates in the range [-1,1], reject with
 * Pythagoras, and compress store the accepted values.
 *
 *
 * 4 Testing ===================================================================
 *
 * For now this is only a visual test, you can compile the file with
 *      cc -xc -DDIST_CIRCLE_TEST -O3 dist_circle.h -lm
 * and plot the data using
 *     ./a.out > /tmp/1; gnuplot -e 'plot "/tmp/1"' -p
 *
 */

 #ifdef DIST_CIRCLE_TEST

 #define RANDOM_H_IMPLEMENTATION
 #include <cauldron/random.h>
 /* https://github.com/camel-cdr/cauldron/blob/main/cauldron/random.h */
 #include <stdio.h>

 int
 main(void)
 {
 	DistCirclefZig zig;
 	dist_circlef_zig_init(&zig);
 	PRNG64RomuQuad prng64;
 	prng64_romu_quad_randomize(&prng64);

 	for (size_t i = 0; i < 10000; ++i) {
 #if 1
 		Vec2f v = dist_circlef_zig(&zig, prng64_romu_quad, &prng64);
 #else
 		Vec2f v = dist_circlef(prng64_romu_quad, &prng64);
 #endif
 		assert(v.x*v.x+v.y*v.y < 1);
 		printf("%f\t%f\n", v.x, v.y);
 	}

 	return 0;
 }
 #endif /* DIST_CIRCLE_TEST */

 /*
 * References ==================================================================
 *
 * <1> "When Greedy Algorithms Can Be Faster", Benjamin Summerton (2025)
 *     URL: "https://16bpp.net/blog/post/when-greedy-algorithms-can-be-faster/"
 *
 * <2> "Ziggurat algorithm", Wikipedia
 *     URL: "https://en.wikipedia.org/wiki/Ziggurat_algorithm"
 *
 * <3> Marsaglia, George, and Wai Wan Tsang (2000):
 *     "The ziggurat method for generating random variables."
 *     Journal of statistical software 5.8: 1-7.
 *
 * <4> " SHISHUA – The fastest PRNG in the world", Thaddée Tyl
 *     URL: "https://github.com/espadrine/shishua"
 */

 /*
 * Licensing ===================================================================
 *
 * MIT License -------------------------------------------------
 *
 * Copyright (c) 2025 Olaf Berstein
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
	/*
	* Using the Ziggurat Method for Sampling Random Coordinates From a Unit Circle
	* by Olaf Bernstein <[email protected]>.
	* Distributed under the MIT license, see license at the end of the file.
	* New versions available at https://github.com/camel-cdr/cauldron
	*
	* Table of contents ===========================================================
	*
	* 1. Introduction
	* 1.1 API Overview
	* 2. Ziggurat Method
	* 3. Benchmarking
	* 3.1 Alternative Implementations
	* 3.2 Benchmark
	* 3.3 Results
	* 4. Testing
	* References
	* Licensing
	* MIT License
	*
	* 1. Introduction =============================================================
	*
	* This project was inspired by the article "When Greedy Algorithms Can Be
	* Faster" <1>, which covers an interesting problem:
	*
	* Generate a random coordinate that is uniformly distributed in a unit circle.
	*
	* An easy solution to this problem is to generate random coordinates in the
	* range [-1,1] and reject them if they don't fall in the unit circle:
	*/

	#include <stdint.h>
	#include <math.h>
	#include <stddef.h>
	#include <string.h>

	typedef struct { float x, y; } Vec2f;
	static Vec2f
	dist_circlef(uint64_t (rand64)(void), void *rng)
	{
	float fx, fy;
	do {
	uint64_t u64 = rand64(rng);
	fx = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
	fy = (u64 & 0xffffff) * (1.0f / ((1u<<24)-1));
	fx = fx2 - 1; / scale to [-1,1] */
	fy = fy*2 - 1;
	} while (fxfx + fyfy >= 1.0); /* reject */
	Vec2f ret = { fx, fy };
	return ret;
	}

	/*
	* We'll subsequently call this implementation the "regular rejection method".
	* It needs to do on average 27% more iterations than the number of times it is
	* called, which causes lots of branch miss predictions.
	*
	* In the following, we'll present an optimized scalar solution, based on the
	* Ziggurat Method <2>.
	*
	* We won't cover SIMD optimized implementations. There the simple rejection
	* method combined with a compress instruction is likely the best approach.
	*
	* 1.1 API Overview ------------------------------------------------------------
	*
	* Random distributions:
	* // random 2d coordinate from unit circle
	* float dist_normalf(uint32_t ()(void), void *);
	*
	* // a faster dist_normalf implementation using a lookup table
	* void dist_circlef_zig_init(DistCirclefZig *);
	* Vec2f dist_circlef_zig(DistCirclefZig , uint64_t ()(void), void );
	*
	*
	* 2. Ziggurat Method ==========================================================
	*
	* The Ziggurat Method <2> was originally introduced by George Marsaglia and
	* Wai Wan Tsang for generating normally distributed numbers <3>.
	*
	* It works by overlapping the normal distribution with boxes, that each
	* encapsulate the same area of the underlying distribution.
	* This can be used to quickly accept
	* coordinates using a small lookup table: +----------------------------+
	* \| exp(-(xx)/2) ***** \|
	* 1. A box is randomly selected and a +****------- \|
	* random x coordinate is generated within \| *** \| \|
	* that box, which reduces the number of \| ** \| \|
	* total rejections. \| Box 3 ***\| \|
	* +---------***---- \|
	* 2. If the x coordinate falls within the \| :** \| \|
	* bounds of the box above the selected box, \| Box 2 : ***\| \|
	* then it is directly accepted. +--------------***---- \|
	* \| Box 1 :*** \| \|
	* 3. Otherwise, a concrete random y +------------------**** \|
	* coordinate within the box is generated \| Box 0 :****** \|
	* and a high resolution rejection test is +----------------------------+
	* applied to the coordinates.
	*
	*
	* We can apply a similar technique to our problem of sampling a 2d coordinate
	* from a unit circle.
	* For simplicity, we start off looking at only a quarter of the unit circle
	* that lies in the first quadrant of the coordinate system, and apply random
	* signs to the final coordinate later.
	* Now we can again subdivide the quarter circle in n boxes contain an equal
	* area A=pi/4/n of the circle.
	*
	* We can do the step 1. from above and select a random box, and generate a
	* random x, and this time also y coordinate within the box.
	* Since the "high resolution" rejection test is, in contrast to the normal,
	* distribution very cheap for our problem, we skip step 2. and do a quick
	* rejection test with xx + yy < 1.
	*
	* \|```---...----------------- \|```----..
	* \| ``-.. \| \| `--.
	* \| ``-. \| \| `--.
	* \| A1 `-. \| \| ``.
	* \| `. \| \|-----------------``.-----
	* \| `.\| \| .-``. ^
	* \|-------------------------`----- \| A_trig .-` `. \|
	* \| `. \| \| .-`` `. \| h
	* \| A2 ` \| \| .-` A_seg : \|
	* \| `\| \| .-`` `. : \|
	* \|------------------------------`.-- \|.` alpha : : v
	* \| `.\| \-------------------------
	* \| A3 :\|
	* \|________________________________-.
	* \| :\|
	* \| A4 :\| A1 = A2 = A3 = A4 = pi/4/n = pi/4/4
	* \| :\|
	* \-----------------------------------
	*
	* The challenging part is computing the lookup table that contains the sizes of
	* our boxes.
	*
	* We need to calculate the height h under the quarter circle, given an area A.
	* This can be done by numerically solving the equation that calculates the
	* area A of the circle for a given height h of a box:
	*
	* A = A_trig + A_seg
	* A_trig = h*cos(alpha)/2
	* A_seg = pi * r^2 * alpha/2/pi = alpha/2
	* alpha = asin(h)
	* A = A_trig + A_seg = h*cos(asin(h))/2 + asin(h)/2
	*
	* We know that h is in the range [0,1], so we can use a binary search to solve
	* the equation:
	*/

	static double
	dist_circle_zig__solve(double area)
	{
	double A, h = 0.5;
	for (size_t i = 2; i < 64; ++i) {
	A = h*cos(asin(h))/2 + asin(h)/2;
	h += (A < area ? 1.0 : -1.0) / (1ull<<i);
	}
	return h;
	}

	/*
	* With this equation worked out, we can now initialize the lookup table.
	*
	* The regular rejection method needs to do on average of 27% more iterations,
	* while the ziggurat method with the default lookup table size of 128 entries,
	* we get down to 0.76% additional iterations.
	*/

	#ifndef DIST_CIRCLEF_ZIG_COUNT
	# define DIST_CIRCLEF_ZIG_COUNT 128 /* must be a power of two and <=2^14 */
	#endif
	#define DIST_CIRCLEF_ZIG_AREA (0.7853981633974483/DIST_CIRCLEF_ZIG_COUNT) /* pi/4/n */

	typedef struct {
	float tbl[DIST_CIRCLEF_ZIG_COUNT*2 + 2];
	} DistCirclefZig;

	static void
	dist_circlef_zig_init(DistCirclefZig *zig)
	{
	for (size_t i = 0; i < DIST_CIRCLEF_ZIG_COUNT; ++i) {
	double A = DIST_CIRCLEF_ZIG_AREA * i;
	double h = dist_circle_zig__solve(A);
	zig->tbl[i*2+0] = cos(asin(h));
	zig->tbl[i*2+1] = h;
	}
	zig->tbl[DIST_CIRCLEF_ZIG_COUNT*2+0] = 0.0;
	zig->tbl[DIST_CIRCLEF_ZIG_COUNT*2+1] = 1.0;
	}

	/*
	* Notice how tbl has on additional coordinate and uses a cumulative height.
	* This allows us to compute the height of a single box with
	* tbl[i2+2]-tbl[i2]. The vertical position is just tbl[i*2].
	*
	* Note that we don't use a Vec2f array for the lookup table, because the code
	* generation was considerably worse.
	*/

	/* needed to efficiently apply sign */
	static float
	u32tof(uint32_t x)
	{
	#ifdef __cplusplus
	float f; memcpy(&f, &x, sizeof f); return f;
	#else
	union{ uint32_t i; float f; } u; u.i = x; return u.f;
	#endif
	}
	size_t cnt1 = 0, cnt2 = 0;

	static Vec2f
	dist_circlef_zig(DistCirclefZig const zig, uint64_t (rand64)(void), void rng)
	{
	float fx, fy;
	uint64_t signs;
	++cnt1;
	do {
	++cnt2;
	uint64_t u64 = rand64(rng);
	fx = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
	fy = (u64 << 24 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
	signs = u64 >> (64-24*2-2);
	uint64_t idx = u64 & (DIST_CIRCLEF_ZIG_COUNT-1);

	float vx = zig->tbl[idx*2+0];
	float vy = zig->tbl[idx*2+1];
	float ny = zig->tbl[idx*2+3];
	fx = vx; / scale to box */
	fy = fy * (ny - vy) + vy;
	} while (fxfx + fyfy >= 1.0); /* reject */
	/* there usually is a sign injection instruction
	* so this compiles down quite neat */
	Vec2f ret = {
	copysignf(fx, u32tof(signs<<31)),
	copysignf(fy, u32tof(signs>>1<<31))
	};
	return ret;
	}


	/*
	* 3. Benchmarking =============================================================
	*
	* 3.1 Alternative Implementations ---------------------------------------------
	*
	* One straight forward way to sample a random coordinate from a unit circle is
	* by generating polar coordinates and converting them to Cartesian
	* coordinates.
	* This does however require computing the sine and cosine of an angle, which
	* is a quite expensive operation.
	*/

	#ifdef DIST_CIRCLE_BENCH

	static Vec2f
	dist_circlef_trig(uint64_t (rand64)(void), void *rng)
	{
	uint64_t u64 = rand64(rng);
	float f1 = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
	float f2 = (u64 & 0xffffff) * (1.0f / ((1u<<24)-1));
	float r = sqrtf(f1);
	float theta = f2 * 6.283185307179586; /* f22pi */
	Vec2f ret = { rcosf(theta), rsinf(theta) };
	return ret;
	}

	/*
	* The regular rejection method mentioned in the introduction suffers from a
	* lot of branch miss predictions, because rejections are quite common, with a
	* 21% probability.
	* If we want to generate multiple random coordinates at a time, then we can get
	* rid of the branch altogether and replace with a much less impactful
	* register dependency.
	*/

	static void
	dist_circlef_rej_multi(uint64_t (rand64)(void), void rng, Vec2f ptr, size_t n)
	{
	for (size_t i = 0; i < n; ) {
	uint64_t u64 = rand64(rng);
	float fx = (u64 >> (64 - 24)) * (1.0f / ((1u<<24)-1));
	float fy = (u64 & 0xffffff) * (1.0f / ((1u<<24)-1));
	fx = fx*2 - 1;
	fy = fy*2 - 1;
	ptr[i].x += fx; // Note: we accumulate to make sure it's not
	ptr[i].y += fy; // optimized away, see benchmark code below
	i += fxfx + fyfy < 1.0;
	}
	}

	/*
	* 3.2 Benchmark ---------------------------------------------------------------
	*
	* We accumulate the results into an array and sum it, to ensure that they
	* don't get optimized away.
	*/

	#define RANDOM_H_IMPLEMENTATION
	#include <cauldron/random.h>
	/* https://github.com/camel-cdr/cauldron/blob/main/cauldron/random.h */
	#define BENCH_DONT_NORMALIZE
	#include <cauldron/bench.h>
	/* https://github.com/camel-cdr/cauldron/blob/main/cauldron/bench.h */
	#include <stdio.h>
	#include <stdlib.h>

	#define N (1024*1024)
	#define WARMUP 16
	#define REPEAT 64

	int
	main(void)
	{
	DistCirclefZig zig;
	dist_circlef_zig_init(&zig);
	PRNG64RomuQuad prng64;
	prng64_romu_quad_randomize(&prng64);

	Vec2f arr = malloc(N sizeof *arr);
	memset(arr, 0, N * sizeof *arr);
	BENCH("trig", WARMUP, REPEAT) {
	for (size_t i = 0; i < N; ++i) {
	Vec2f v = dist_circlef_trig(prng64_romu_quad, &prng64);
	arr[i].x += v.x; arr[i].y += v.y;
	}
	BENCH_CLOBBER();
	}
	BENCH("rej", WARMUP, REPEAT) {
	for (size_t i = 0; i < N; ++i) {
	Vec2f v = dist_circlef(prng64_romu_quad, &prng64);
	arr[i].x += v.x; arr[i].y += v.y;
	}
	BENCH_CLOBBER();
	}
	BENCH("zig", WARMUP, REPEAT) {
	for (size_t i = 0; i < N; ++i) {
	Vec2f v = dist_circlef_zig(&zig, prng64_romu_quad, &prng64);
	arr[i].x += v.x; arr[i].y += v.y;
	}
	BENCH_CLOBBER();
	}
	BENCH("rej_multi", WARMUP, REPEAT) {
	dist_circlef_rej_multi(prng64_romu_quad, &prng64, arr, N);
	BENCH_CLOBBER();
	}
	double sum = 0;
	for (size_t i = 0; i < N; ++i)
	sum += arr[i].x - arr[i].y;
	printf("sum: %f\n", sum);

	printf("%zu, %zu\n", cnt1, cnt2);
	bench_done();
	return 0;
	}
	#endif /* DIST_CIRCLE_BENCH */


	/*
	* 3.3 Results -----------------------------------------------------------------
	*
	* Run on Zen1 1600x:
	*
	* relative time trig rej zig rej_multi
	* clang -O3 7.72 2.94 1.23 1.15
	* gcc -O3 6.84 2.95 1.32 1.54
	* clang -O3 -ffast-math 7.25 2.71 1.45 1.00
	* gcc -O3 -ffast-math 6.36 2.72 1.24 1.28
	*
	* The ziggurat method is a clear improvement over both the simple rejection
	* and trigonometric method.
	* It's likely the best for latency sensitive and one off usage.
	* If you need to generate multiple random coordinates uniformly distributed in
	* a unit circle, then the regular rejection methods ends up faster, although
	* there you might as well consider a SIMD implementation.
	*
	* For SIMD I'd recommend using a fast SIMD PRNG like SHISHUA <4>, convert the
	* random bits to a uniform coordinates in the range [-1,1], reject with
	* Pythagoras, and compress store the accepted values.
	*
	*
	* 4 Testing ===================================================================
	*
	* For now this is only a visual test, you can compile the file with
	* cc -xc -DDIST_CIRCLE_TEST -O3 dist_circle.h -lm
	* and plot the data using
	* ./a.out > /tmp/1; gnuplot -e 'plot "/tmp/1"' -p
	*
	*/

	#ifdef DIST_CIRCLE_TEST

	#define RANDOM_H_IMPLEMENTATION
	#include <cauldron/random.h>
	/* https://github.com/camel-cdr/cauldron/blob/main/cauldron/random.h */
	#include <stdio.h>

	int
	main(void)
	{
	DistCirclefZig zig;
	dist_circlef_zig_init(&zig);
	PRNG64RomuQuad prng64;
	prng64_romu_quad_randomize(&prng64);

	for (size_t i = 0; i < 10000; ++i) {
	#if 1
	Vec2f v = dist_circlef_zig(&zig, prng64_romu_quad, &prng64);
	#else
	Vec2f v = dist_circlef(prng64_romu_quad, &prng64);
	#endif
	assert(v.xv.x+v.yv.y < 1);
	printf("%f\t%f\n", v.x, v.y);
	}

	return 0;
	}
	#endif /* DIST_CIRCLE_TEST */

	/*
	* References ==================================================================
	*
	* <1> "When Greedy Algorithms Can Be Faster", Benjamin Summerton (2025)
	* URL: "https://16bpp.net/blog/post/when-greedy-algorithms-can-be-faster/"
	*
	* <2> "Ziggurat algorithm", Wikipedia
	* URL: "https://en.wikipedia.org/wiki/Ziggurat_algorithm"
	*
	* <3> Marsaglia, George, and Wai Wan Tsang (2000):
	* "The ziggurat method for generating random variables."
	* Journal of statistical software 5.8: 1-7.
	*
	* <4> " SHISHUA – The fastest PRNG in the world", Thaddée Tyl
	* URL: "https://github.com/espadrine/shishua"
	*/

	/*
	* Licensing ===================================================================
	*
	* MIT License -------------------------------------------------
	*
	* Copyright (c) 2025 Olaf Berstein
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*
	*/