Marc-B-Reynolds · January 20, 2025 21:11
diff --git a/atan_spitball.c b/atan_spitball.c
 // Public Domain under http://unlicense.org, see link for details.
 // EXCEPT "CORE-MATH" marked section

 // {gcc,clang} -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off atan_spitball.c -o atan_spitball -lm

 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
 #include <assert.h>

 //************************************
 // minimal spitball atan(x) on [-1,1]

 // classic polynomial: abs_error <= 4.952073e-03 (radians)
 float atan_p(float x)
 {
  static const float C1 =  0.97239410877227783203125f;
  static const float C3 = -0.19194793701171875f;
  
  return x*fmaf(C3*x,x,C1);
 }

 // use |x| instead of x^2 variant
 // abs_error <= 3.347754e-03 (radians)
 float atan_x(float x)
 {
  static const float C1 =  1.0546815395355224609375;
  static const float C3 = -0.2659356594085693359375;
  
  return x*fmaf(C3,fabsf(x),C1);
 }

 // |x| + lower error at x = +/-1 variant
 // abs_error <= 3.741682e-03 (radians)
 float atan_e(float x)
 {
  static const float C1 =  1.0584795985416213f;
  static const float C3 = -0.273081435144173f;
  
  return x*fmaf(C3,fabsf(x),C1);
 }


 // ************************************
 // The CORE-MATH routine fall under:
 //  
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 //copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // 
 // The above copyright notice and this permission notice shall be included in all
 // copies or substantial portions of the Software.
 // 
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.


 //**********************************************************************

 #define FUNC_TYPE_ODD  1
 #define FUNC_TYPE_EVEN 2

 // bunch of wip temp hacks

 typedef struct {
  float (*f)(float);
  char*    name;
  char*    notes;
  uint32_t flags;
  uint32_t type;
 } func_entry_t;

 typedef struct {
  uint32_t x0;
  uint32_t x1;
  char*    name;
 } func_range_entry_t;

 typedef struct {
  char*    name;
  func_range_entry_t* table;
  uint32_t            num;
 } func_range_table_t;

 #define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
 #define STRINGIFY(S)  STRINGIFY_(S)
 #define STRINGIFY_(S) #S
 #define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

 float libm(float x) { return atanf(x); }


 //**********************************************************************
 // SEE: https://core-math.gitlabpages.inria.fr
 // and license info at top of file.

 // oh my!
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wunknown-warning-option"
 #pragma GCC diagnostic ignored "-Wconversion"
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #pragma GCC diagnostic ignored "-Wshorten-64-to-32"
 #pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
 #pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
 #pragma GCC diagnostic ignored "-Wfloat-conversion"
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

 #include <fenv.h>

 typedef union {float f; uint32_t u;} b32u32_u;
 typedef union {double f; uint64_t u;} b64u64_u;
 typedef uint64_t u64;

 float cr_atanf(float x){
  const double pi2 = 0x1.921fb54442d18p+0;
  b32u32_u t = {.f = x};
  int e = (t.u>>23)&0xff, gt = e>=127;
  if(__builtin_expect(e==0xff, 0)) {
    if(t.u<<9) return x; // nan
    return __builtin_copysign(pi2,(double)x); // inf
  }
  if (__builtin_expect(e<127-13, 0)){
    if (__builtin_expect(e<127-25, 0))
      return __builtin_fmaf(-x, __builtin_fabsf(x), x);
    return __builtin_fmaf(-0x1.5555555555555p-2f*x, x*x, x);
  }
  /* now |x| >= 0x1p-13 */
  double z = x;
  if (gt) z = 1/z; /* gt is non-zero for |x| >= 1 */
  double z2 = z*z, z4 = z2*z2, z8 = z4*z4;
  /* polynomials generated using rminimax
     (https://gitlab.inria.fr/sfilip/rminimax) with the following command:
     ./ratapprox --function="atan(x)" --dom=[0.000122070,1] --num=[x,x^3,x^5,x^7,x^9,x^11,x^13] --den=[1,x^2,x^4,x^6,x^8,x^10,x^12] --output=atanf.sollya --log
     (see output atanf.sollya)
     The coefficient cd[0] was slightly reduced from the original value
     0x1.51eccde075d67p-2 to avoid an exceptional case for |x| = 0x1.1ad646p-4
     and rounding to nearest.
  */
  static const double cn[] =
    {0x1.51eccde075d67p-2, 0x1.a76bb5637f2f2p-1, 0x1.81e0eed20de88p-1,
     0x1.376c8ca67d11dp-2, 0x1.aec7b69202ac6p-5, 0x1.9561899acc73ep-9,
     0x1.bf9fa5b67e6p-16};
  static const double cd[] =
    {0x1.51eccde075d66p-2, 0x1.dfbdd7b392d28p-1, 0x1p+0,
     0x1.fd22bf0e89b54p-2, 0x1.d91ff8b576282p-4, 0x1.653ea99fc9bbp-7,
     0x1.1e7fcc202340ap-12};
  double cn0 = cn[0] + z2*cn[1];
  double cn2 = cn[2] + z2*cn[3];
  double cn4 = cn[4] + z2*cn[5];
  double cn6 = cn[6];
  cn0 += z4*cn2;
  cn4 += z4*cn6;
  cn0 += z8*cn4;
  cn0 *= z;
  double cd0 = cd[0] + z2*cd[1];
  double cd2 = cd[2] + z2*cd[3];
  double cd4 = cd[4] + z2*cd[5];
  double cd6 = cd[6];
  cd0 += z4*cd2;
  cd4 += z4*cd6;
  cd0 += z8*cd4;
  double r = cn0/cd0;
  if (!gt) return r; /* for |x| < 1, (float) r is correctly rounded */

  /* now |x| >= 1 */
  r = __builtin_copysign(0x1.0fdaa22168c23p-7, z) - r + __builtin_copysign(0x1.9p0, z);
  return r;
 }

 #pragma GCC diagnostic pop


 //********************************************************



 func_entry_t func_table[] =
 {
 //ENTRY(libm),
  ENTRY(atan_p),
  ENTRY(atan_x),
  ENTRY(atan_e),
 };

 const char* func_name = "atan";

 float cr_func(float x) { return cr_atanf(x); }


 //********************************************************

 static const uint32_t f32_sign_bit_k = UINT32_C(0x80000000);

 static inline uint32_t f32_to_bits(float x)
 {
  uint32_t u; memcpy(&u, &x, 4); return u;
 }

 static inline float f32_from_bits(uint32_t x)
 {
  float f; memcpy(&f, &x, 4); return f;
 }

 static inline uint32_t u32_abs(uint32_t x)
 {
  return (int32_t)x >= 0 ? x : -x;
 }


 // ulp distance provided a & b are finite
 // and have the same sign
 static inline uint32_t f32_ulp_dist_ss(float a, float b)
 {
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return u32_abs(ua-ub);
 }

 // ulp distance for finite a & b
 static inline uint32_t f32_ulp_dist(float a, float b)
 {
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);

  if ((int32_t)(ub^ua) >= 0)
    return u32_abs(ua-ub);
  
  return ua+ub+f32_sign_bit_k;
 }

 typedef struct {
  float    abs;
  uint32_t max;
  uint32_t ulp[4];
 } func_error_t;

 // tracks total error data across multiple intervals
 func_error_t func_error[LENGTHOF(func_table)] = {{0}};


 // add error data from current interval to the totals
 void error_to_totals(func_error_t* e)
 {
  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
    func_error[fi].ulp[0] += e->ulp[0];
    func_error[fi].ulp[1] += e->ulp[1];
    func_error[fi].ulp[2] += e->ulp[2];
    func_error[fi].ulp[3] += e->ulp[3];

    if (e->max > func_error[fi].max) func_error[fi].max = e->max;
    if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs;
  }
 }

 void error_dump_i(func_error_t* e)
 {
  printf("|%15s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%12s|\n",
 	 "func", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP",
 	 "CR%", "FR%", "2 ULP%","> 2 ULP%", "abs");

  printf("|%15s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%12s|\n",
 	 "---:", "---:", "---:", "---:", "---:", "---:",
 	 "---:", "---:", "---:","---:", "---:");

  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
    uint32_t u0 = e->ulp[0];
    uint32_t u1 = e->ulp[1];
    uint32_t u2 = e->ulp[2];
    uint32_t u3 = e->ulp[3];
    uint32_t t  = (u0+u1+u2+u3);
    double   s  = 100.0/(double)t;

    printf("|%15s|%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|%e|\n",
 	   func_table[fi].name, e->max,
 	   u0, u1, u2, u3,
 	   s*u0, s*u1, s*u2, s*u3, e->abs
 	   );
  }
 }

 void error_dump(void)
 {
  printf("\nTOTAL: %s\n", func_name);
  error_dump_i(func_error);
 }

 static inline void test_error_add(func_error_t* error, float e, float a)
 {
  float d = fabsf(e-a);

  if (d == 0.f) { error->ulp[0]++; return; };

  uint32_t ulp = f32_ulp_dist(e,a);

  if (ulp > error->max) { error->max = ulp; }
  if (d   > error->abs) { error->abs = d;   }
  
  if (ulp > 3) ulp = 3;
  
  error->ulp[ulp]++;
 }

 void test_force(uint32_t x0, uint32_t x1)
 {
  if (x1 < x0) { uint32_t t = x0; x0=x1; x1=t; }

  float f0 = f32_from_bits(x0);
  float f1 = f32_from_bits(x1);
  
  printf("\nchecking: %s on [%08x,%08x] [%e,%e]\n", func_name, x0,x1,f0,f1);

  func_error_t error[LENGTHOF(func_table)] = {{0}};
  
  for(uint32_t ix=x0; ix<=x1; ix++) {
    float x  = f32_from_bits(ix);
    float cr = cr_func(x);
    
    for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
      float r = func_table[fi].f(x);
      test_error_add(error+fi, cr,r);
    }
  }
  
  error_to_totals(error);
  error_dump_i(error);
 }

 void test_linear_range(uint32_t x0, uint32_t x1, float k)
 {
  func_error_t error[LENGTHOF(func_table)] = {{0}};
  
  float f0 = f32_from_bits(x0);
  float f1 = f32_from_bits(x1);
  
  printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {linear result range %a*x}\n", func_name, x0,x1,f0,f1,k);
  
  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
    for(uint32_t xi=x0; xi<=x1; xi++) {
      float x  = f32_from_bits(xi);
      float r  = func_table[fi].f(x);
      float cr = k*x;
      test_error_add(error+fi,cr,r);
    }
  }
  error_to_totals(error);
  error_dump_i(error);
 }


 // test a full power-of-two interval starting from x. (need not start on a boundary)
 void test_1pot(float x)
 {
  uint32_t x0 = f32_to_bits(x);
  uint32_t x1 = f32_to_bits(2.f*x)-1;
  test_force(x0,x1);
 }

 // correctly rounded f(x) = k on [x0,x1]
 void test_const_range(uint32_t x0, uint32_t x1, float k)
 {
  func_error_t error[LENGTHOF(func_table)] = {{0}};
  
  float f0 = f32_from_bits(x0);
  float f1 = f32_from_bits(x1);

  printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {constant result range %a}\n", func_name, x0,x1,f0,f1,k);
  
  for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
    for(uint32_t xi=x0; xi<=x1; xi++) {
      float x = f32_from_bits(xi);
      float r = func_table[fi].f(x);
      test_error_add(error+fi,k,r);
    }
  }
  error_to_totals(error);
  error_dump_i(error);
 }

 void test_all(void)
 {
  uint32_t x0 = 0;
  uint32_t x1 = 0x39b89ba2;
  
  test_linear_range(x0, x1, 1.f);

  x0 = x1+1; x1=f32_to_bits(1.f/64.f);
  test_force(x0,x1);

  // break-down the interior a bit. probably overkill WRT breakdown
  test_1pot(1.f/64.f);
  test_1pot(1.f/32.f);
  test_1pot(1.f/16.f);
  test_1pot(1.f/ 8.f);
  test_1pot(1.f/ 4.f);
  test_1pot(1.f/ 2.f);
 }

 int main(void)
 {
  printf("<details markdown=\"1\"><summary>click for range breakdown</summary>\n\n");

  test_all();

  printf("\n</details>\n\n");

  // dump summary
  error_dump();

  return 0;
 }
diff --git a/output.md b/output.md
	// Public Domain under http://unlicense.org, see link for details.
	// EXCEPT "CORE-MATH" marked section

	// {gcc,clang} -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off atan_spitball.c -o atan_spitball -lm

	#include <stdint.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>
	#include <string.h>
	#include <assert.h>

	//************************************
	// minimal spitball atan(x) on [-1,1]

	// classic polynomial: abs_error <= 4.952073e-03 (radians)
	float atan_p(float x)
	{
	static const float C1 = 0.97239410877227783203125f;
	static const float C3 = -0.19194793701171875f;

	return xfmaf(C3x,x,C1);
	}

	// use \|x\| instead of x^2 variant
	// abs_error <= 3.347754e-03 (radians)
	float atan_x(float x)
	{
	static const float C1 = 1.0546815395355224609375;
	static const float C3 = -0.2659356594085693359375;

	return x*fmaf(C3,fabsf(x),C1);
	}

	// \|x\| + lower error at x = +/-1 variant
	// abs_error <= 3.741682e-03 (radians)
	float atan_e(float x)
	{
	static const float C1 = 1.0584795985416213f;
	static const float C3 = -0.273081435144173f;

	return x*fmaf(C3,fabsf(x),C1);
	}


	// ************************************
	// The CORE-MATH routine fall under:
	//
	// Permission is hereby granted, free of charge, to any person obtaining a copy
	// of this software and associated documentation files (the "Software"), to deal
	// in the Software without restriction, including without limitation the rights
	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	//copies of the Software, and to permit persons to whom the Software is
	// furnished to do so, subject to the following conditions:
	//
	// The above copyright notice and this permission notice shall be included in all
	// copies or substantial portions of the Software.
	//
	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	// SOFTWARE.


	//**********************************************************************

	#define FUNC_TYPE_ODD 1
	#define FUNC_TYPE_EVEN 2

	// bunch of wip temp hacks

	typedef struct {
	float (*f)(float);
	char* name;
	char* notes;
	uint32_t flags;
	uint32_t type;
	} func_entry_t;

	typedef struct {
	uint32_t x0;
	uint32_t x1;
	char* name;
	} func_range_entry_t;

	typedef struct {
	char* name;
	func_range_entry_t* table;
	uint32_t num;
	} func_range_table_t;

	#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))
	#define STRINGIFY(S) STRINGIFY_(S)
	#define STRINGIFY_(S) #S
	#define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }

	float libm(float x) { return atanf(x); }


	//**********************************************************************
	// SEE: https://core-math.gitlabpages.inria.fr
	// and license info at top of file.

	// oh my!
	#pragma GCC diagnostic push
	#pragma GCC diagnostic ignored "-Wpragmas"
	#pragma GCC diagnostic ignored "-Wpedantic"
	#pragma GCC diagnostic ignored "-Wunknown-warning-option"
	#pragma GCC diagnostic ignored "-Wconversion"
	#pragma GCC diagnostic ignored "-Wsign-conversion"
	#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
	#pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
	#pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
	#pragma GCC diagnostic ignored "-Wfloat-conversion"
	#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

	#include <fenv.h>

	typedef union {float f; uint32_t u;} b32u32_u;
	typedef union {double f; uint64_t u;} b64u64_u;
	typedef uint64_t u64;

	float cr_atanf(float x){
	const double pi2 = 0x1.921fb54442d18p+0;
	b32u32_u t = {.f = x};
	int e = (t.u>>23)&0xff, gt = e>=127;
	if(__builtin_expect(e==0xff, 0)) {
	if(t.u<<9) return x; // nan
	return __builtin_copysign(pi2,(double)x); // inf
	}
	if (__builtin_expect(e<127-13, 0)){
	if (__builtin_expect(e<127-25, 0))
	return __builtin_fmaf(-x, __builtin_fabsf(x), x);
	return __builtin_fmaf(-0x1.5555555555555p-2fx, xx, x);
	}
	/* now \|x\| >= 0x1p-13 */
	double z = x;
	if (gt) z = 1/z; /* gt is non-zero for \|x\| >= 1 */
	double z2 = zz, z4 = z2z2, z8 = z4*z4;
	/* polynomials generated using rminimax
	(https://gitlab.inria.fr/sfilip/rminimax) with the following command:
	./ratapprox --function="atan(x)" --dom=[0.000122070,1] --num=[x,x^3,x^5,x^7,x^9,x^11,x^13] --den=[1,x^2,x^4,x^6,x^8,x^10,x^12] --output=atanf.sollya --log
	(see output atanf.sollya)
	The coefficient cd[0] was slightly reduced from the original value
	0x1.51eccde075d67p-2 to avoid an exceptional case for \|x\| = 0x1.1ad646p-4
	and rounding to nearest.
	*/
	static const double cn[] =
	{0x1.51eccde075d67p-2, 0x1.a76bb5637f2f2p-1, 0x1.81e0eed20de88p-1,
	0x1.376c8ca67d11dp-2, 0x1.aec7b69202ac6p-5, 0x1.9561899acc73ep-9,
	0x1.bf9fa5b67e6p-16};
	static const double cd[] =
	{0x1.51eccde075d66p-2, 0x1.dfbdd7b392d28p-1, 0x1p+0,
	0x1.fd22bf0e89b54p-2, 0x1.d91ff8b576282p-4, 0x1.653ea99fc9bbp-7,
	0x1.1e7fcc202340ap-12};
	double cn0 = cn[0] + z2*cn[1];
	double cn2 = cn[2] + z2*cn[3];
	double cn4 = cn[4] + z2*cn[5];
	double cn6 = cn[6];
	cn0 += z4*cn2;
	cn4 += z4*cn6;
	cn0 += z8*cn4;
	cn0 *= z;
	double cd0 = cd[0] + z2*cd[1];
	double cd2 = cd[2] + z2*cd[3];
	double cd4 = cd[4] + z2*cd[5];
	double cd6 = cd[6];
	cd0 += z4*cd2;
	cd4 += z4*cd6;
	cd0 += z8*cd4;
	double r = cn0/cd0;
	if (!gt) return r; /* for \|x\| < 1, (float) r is correctly rounded */

	/* now \|x\| >= 1 */
	r = __builtin_copysign(0x1.0fdaa22168c23p-7, z) - r + __builtin_copysign(0x1.9p0, z);
	return r;
	}

	#pragma GCC diagnostic pop


	//********************************************************



	func_entry_t func_table[] =
	{
	//ENTRY(libm),
	ENTRY(atan_p),
	ENTRY(atan_x),
	ENTRY(atan_e),
	};

	const char* func_name = "atan";

	float cr_func(float x) { return cr_atanf(x); }


	//********************************************************

	static const uint32_t f32_sign_bit_k = UINT32_C(0x80000000);

	static inline uint32_t f32_to_bits(float x)
	{
	uint32_t u; memcpy(&u, &x, 4); return u;
	}

	static inline float f32_from_bits(uint32_t x)
	{
	float f; memcpy(&f, &x, 4); return f;
	}

	static inline uint32_t u32_abs(uint32_t x)
	{
	return (int32_t)x >= 0 ? x : -x;
	}


	// ulp distance provided a & b are finite
	// and have the same sign
	static inline uint32_t f32_ulp_dist_ss(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	return u32_abs(ua-ub);
	}

	// ulp distance for finite a & b
	static inline uint32_t f32_ulp_dist(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);

	if ((int32_t)(ub^ua) >= 0)
	return u32_abs(ua-ub);

	return ua+ub+f32_sign_bit_k;
	}

	typedef struct {
	float abs;
	uint32_t max;
	uint32_t ulp[4];
	} func_error_t;

	// tracks total error data across multiple intervals
	func_error_t func_error[LENGTHOF(func_table)] = {{0}};


	// add error data from current interval to the totals
	void error_to_totals(func_error_t* e)
	{
	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
	func_error[fi].ulp[0] += e->ulp[0];
	func_error[fi].ulp[1] += e->ulp[1];
	func_error[fi].ulp[2] += e->ulp[2];
	func_error[fi].ulp[3] += e->ulp[3];

	if (e->max > func_error[fi].max) func_error[fi].max = e->max;
	if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs;
	}
	}

	void error_dump_i(func_error_t* e)
	{
	printf("\|%15s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%12s\|\n",
	"func", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP",
	"CR%", "FR%", "2 ULP%","> 2 ULP%", "abs");

	printf("\|%15s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%10s\|%12s\|\n",
	"---:", "---:", "---:", "---:", "---:", "---:",
	"---:", "---:", "---:","---:", "---:");

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) {
	uint32_t u0 = e->ulp[0];
	uint32_t u1 = e->ulp[1];
	uint32_t u2 = e->ulp[2];
	uint32_t u3 = e->ulp[3];
	uint32_t t = (u0+u1+u2+u3);
	double s = 100.0/(double)t;

	printf("\|%15s\|%10u\|%10u\|%10u\|%10u\|%10u\|%10f\|%10f\|%10f\|%10f\|%e\|\n",
	func_table[fi].name, e->max,
	u0, u1, u2, u3,
	su0, su1, su2, su3, e->abs
	);
	}
	}

	void error_dump(void)
	{
	printf("\nTOTAL: %s\n", func_name);
	error_dump_i(func_error);
	}

	static inline void test_error_add(func_error_t* error, float e, float a)
	{
	float d = fabsf(e-a);

	if (d == 0.f) { error->ulp[0]++; return; };

	uint32_t ulp = f32_ulp_dist(e,a);

	if (ulp > error->max) { error->max = ulp; }
	if (d > error->abs) { error->abs = d; }

	if (ulp > 3) ulp = 3;

	error->ulp[ulp]++;
	}

	void test_force(uint32_t x0, uint32_t x1)
	{
	if (x1 < x0) { uint32_t t = x0; x0=x1; x1=t; }

	float f0 = f32_from_bits(x0);
	float f1 = f32_from_bits(x1);

	printf("\nchecking: %s on [%08x,%08x] [%e,%e]\n", func_name, x0,x1,f0,f1);

	func_error_t error[LENGTHOF(func_table)] = {{0}};

	for(uint32_t ix=x0; ix<=x1; ix++) {
	float x = f32_from_bits(ix);
	float cr = cr_func(x);

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
	float r = func_table[fi].f(x);
	test_error_add(error+fi, cr,r);
	}
	}

	error_to_totals(error);
	error_dump_i(error);
	}

	void test_linear_range(uint32_t x0, uint32_t x1, float k)
	{
	func_error_t error[LENGTHOF(func_table)] = {{0}};

	float f0 = f32_from_bits(x0);
	float f1 = f32_from_bits(x1);

	printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {linear result range %a*x}\n", func_name, x0,x1,f0,f1,k);

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
	for(uint32_t xi=x0; xi<=x1; xi++) {
	float x = f32_from_bits(xi);
	float r = func_table[fi].f(x);
	float cr = k*x;
	test_error_add(error+fi,cr,r);
	}
	}
	error_to_totals(error);
	error_dump_i(error);
	}


	// test a full power-of-two interval starting from x. (need not start on a boundary)
	void test_1pot(float x)
	{
	uint32_t x0 = f32_to_bits(x);
	uint32_t x1 = f32_to_bits(2.f*x)-1;
	test_force(x0,x1);
	}

	// correctly rounded f(x) = k on [x0,x1]
	void test_const_range(uint32_t x0, uint32_t x1, float k)
	{
	func_error_t error[LENGTHOF(func_table)] = {{0}};

	float f0 = f32_from_bits(x0);
	float f1 = f32_from_bits(x1);

	printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {constant result range %a}\n", func_name, x0,x1,f0,f1,k);

	for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) {
	for(uint32_t xi=x0; xi<=x1; xi++) {
	float x = f32_from_bits(xi);
	float r = func_table[fi].f(x);
	test_error_add(error+fi,k,r);
	}
	}
	error_to_totals(error);
	error_dump_i(error);
	}

	void test_all(void)
	{
	uint32_t x0 = 0;
	uint32_t x1 = 0x39b89ba2;

	test_linear_range(x0, x1, 1.f);

	x0 = x1+1; x1=f32_to_bits(1.f/64.f);
	test_force(x0,x1);

	// break-down the interior a bit. probably overkill WRT breakdown
	test_1pot(1.f/64.f);
	test_1pot(1.f/32.f);
	test_1pot(1.f/16.f);
	test_1pot(1.f/ 8.f);
	test_1pot(1.f/ 4.f);
	test_1pot(1.f/ 2.f);
	}

	int main(void)
	{
	printf("<details markdown=\"1\"><summary>click for range breakdown</summary>\n\n");

	test_all();

	printf("\n</details>\n\n");

	// dump summary
	error_dump();

	return 0;
	}
func	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%	abs
atan_p	463150	19	36	36	968399688	0.000002	0.000004	0.000004	99.999991	9.720359e-06
atan_x	869840	10	18	18	968399733	0.000001	0.000002	0.000002	99.999995	1.922101e-05
atan_e	926918	9	17	17	968399736	0.000001	0.000002	0.000002	99.999996	2.055746e-05
func	max ULP	> 2 ULP	> 2 ULP%	abs
atan_p	463150	46621790	100.000000	4.308028e-04
atan_x	867983	46621790	100.000000	7.907441e-04
atan_e	925032	46621790	100.000000	8.483445e-04
func	max ULP	> 2 ULP	> 2 ULP%	abs
atan_p	462609	8388608	100.000000	8.583758e-04
atan_x	753944	8388608	100.000000	1.459263e-03
atan_e	809023	8388608	100.000000	1.570972e-03
func	max ULP	> 2 ULP	> 2 ULP%	abs
atan_p	460986	8388608	100.000000	1.691040e-03
atan_x	644221	8388608	100.000000	2.459977e-03
atan_e	697071	8388608	100.000000	2.669439e-03
func	max ULP	> 2 ULP	> 2 ULP%	abs
atan_p	454503	8388608	100.000000	3.180631e-03
atan_x	444822	8388608	100.000000	3.324963e-03
atan_e	492504	8388608	100.000000	3.688060e-03
func	max ULP	> 2 ULP	> 2 ULP%	abs
atan_p	428760	8388608	100.000000	4.879326e-03
atan_x	266419	8388608	100.000000	3.347740e-03
atan_e	290786	8388608	100.000000	3.741622e-03
func	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%	abs
atan_p	329009	0	0	0	8388608	0.000000	0.000000	0.000000	100.000000	4.952013e-03
atan_x	138966	65	85	77	8388381	0.000775	0.001013	0.000918	99.997294	2.790779e-03
atan_e	172714	50	99	76	8388383	0.000596	0.001180	0.000906	99.997318	2.678216e-03
func	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%	abs
atan_p	83082	53	93	84	8388378	0.000632	0.001109	0.001001	99.997258	4.952073e-03
atan_x	107120	45	91	90	8388382	0.000536	0.001085	0.001073	99.997306	3.347754e-03
atan_e	109058	6	54	82	8388466	0.000072	0.000644	0.000978	99.998307	3.741682e-03
func	max ULP	CR	FR	2 ULP	> 2 ULP	CR%	FR%	2 ULP%	> 2 ULP%	abs
atan_p	463150	72	129	120	1065352896	0.000007	0.000012	0.000011	99.999970	4.952073e-03
atan_x	869840	120	194	185	1065352718	0.000011	0.000018	0.000017	99.999953	3.347754e-03
atan_e	926918	65	170	175	1065352807	0.000006	0.000016	0.000016	99.999962	3.741682e-03