Marc-B-Reynolds · May 10, 2020 07:24
diff --git a/tanpi.c b/tanpi.c
 // Public Domain under http://unlicense.org, see link for details.

 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>

 #include "xmmintrin.h"
 #include "wmmintrin.h"
 #include "x86intrin.h"

 #define RANDOMIZE

 // histogram size
 #define HLEN 127
 #define TLEN 0xFFFFFF

 // external code: xoroshiro128+

 inline uint32_t f32_to_bits(float x)
 {
  uint32_t u; memcpy(&u,&x,4); return u;
 }

 inline float f32_from_bits(uint32_t x)
 {
  float u; memcpy(&u,&x,4); return u;
 }

 inline uint32_t u32_abs(uint32_t x)
 {
  return (int32_t)x >= 0 ? x : -x;
 }

 inline uint32_t f32_ulp_dist(float a, float b)
 {
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  uint32_t s  = ub^ua;

  if ((int32_t)s >= 0)
    return u32_abs(ua-ub);
  
  return ua+ub+0x80000000;
 }

 //*******************

 uint64_t rng_state[2];

 inline uint64_t rotl(const uint64_t v, int i)
 {
  return (v << i)|(v >> (64-i));
 }

 inline uint64_t rng_u64(void)
 {
  uint64_t s0 = rng_state[0];
  uint64_t s1 = rng_state[1];
  uint64_t r  = s0 + s1;
  
  s1 ^= s0;
  rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14);
  rng_state[1] = rotl(s1,36);
  
  return r;
 }
 // end: xoroshiro128+


 // double on [0,1)
 inline double rng_f64(void)
 {
  return (double)(rng_u64() >> (64-53))*0x1p-53;
 }

 inline float rng_f32(void)
 {
  return (float)(rng_u64() >> (64-24))*0x1p-24f;
 }

 void reset_generators(uint64_t s0, uint64_t s1)
 {
  rng_state[0] = s0;
  rng_state[1] = s1;
  rng_u64();
 }

 typedef struct {float a,b; } f32_pair_t;

 inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); }
 inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); }

 inline float f32_mask_select(uint32_t m, float a, float b)
 {
  return f32_from_bits((f32_to_bits(a) & m)|(f32_to_bits(b) & ~m));
 }

 float f32_sin_pi_3_k[] = {0x1.91f5aap1f, -0x1.408cf2p2f};
 float f32_sin_pi_5_k[] = {0x1.921f8ep1f, -0x1.4aa618p2f, 0x1.3f3f3cp1f};
 float f32_sin_pi_7_k[] = {0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f};

 float f32_cos_pi_4_k[] = {0x1.fffe74p-1f, -0x1.3ba0ecp2f, 0x1.f73ff8p1f};
 float f32_cos_pi_6_k[] = {0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f};
 float f32_cos_pi_8_k[] = {0x1p0f, -0x1.3bd3ccp2f, 0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f};

 // tan(pi x)
 // spitball of max relative error 2.38419*10^-7 (2 ulp)
 float f32_tanpi(float a)
 {
  static const float* S = f32_sin_pi_7_k;
  static const float* C = f32_cos_pi_6_k;

  float    c,s,a2,a3,r;
  uint32_t q,sgn;

  r   = rintf(a+a);
  a   = fmaf(r,-0.5f,a);
  q   = (uint32_t)r;    q = -(q&1);
  sgn = q << 31;
  a2  = a*a;

  c  = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
  c  = fmaf(c,    a2, C[1]); s = fmaf(s,    a2, S[1]); 
  c  = fmaf(c,    a2, C[0]); s = a3 * s;
  c  = f32_mulsign(c,sgn);   s = fmaf(a,    S[0], s);

  float n = f32_mask_select(q,c,s);
  float d = f32_mask_select(q,s,c);

  return n/d;
 }

 // tan(pi x)
 // spitball of max relative error ~0.0000207424 (174 ulp)
 float f32_tanpi_r(float a)
 {
  static const float* S = f32_sin_pi_5_k;
  static const float* C = f32_cos_pi_4_k;

  float    c,s,a2,a3,r;
  uint32_t q,sgn;

  r   = rintf(a+a);
  a   = fmaf(r,-0.5f,a);
  q   = (uint32_t)r;
  sgn = q << 31;   q = -(q&1);
  a2  = a*a;
  
  c  = fmaf(C[2], a2, C[1]); s = fmaf(S[2], a2, S[1]); a3 = a2*a;
  c  = fmaf(c,    a2, C[0]); s = a3 * s;
  c  = f32_mulsign(c,sgn);   s = fmaf(a,    S[0], s);

  float n = f32_mask_select(q,c,s);
  float d = f32_mask_select(q,s,c);

  return n/d;
 }

 #define HISTO_LEN 177
 #define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))

 void test()
 {
  uint32_t histo[HISTO_LEN] = {0};
  uint32_t maxD = 0;

  // bordering on useless perf test
 #if 0
  double   total=0.0;
  double   min = 0x1p23f;

  uint64_t t0;
  uint64_t t1;

  for(uint32_t j=0; j<32; j++) {
    t0 = _rdtsc();
    float x0 = rng_f32(); x0 = 2.f*x0-1.f;
    float t = 0.f;
    
    for(uint32_t i=0; i<0xffff; i++) {
      float x0 = rng_f32(); x0 = 2.f*x0-1.f;
      
      float tr = f32_tanpi_r(x0);
      
      t += tr;
    }
    
    t1 = _rdtsc();
    double dt = (double)(t1-t0)*(1.0/65536.0);
    if (min > dt) min = dt;
    total += dt;
    printf("time: %f (%f)\n", dt, t);
  }
  printf("      %f %f\n", total*(1.0/32.0), min);
 #endif  

  // questionable (to be generous) error testing
  // libm tan error bound is unknown but assumes
  // tan(PI*x) is correctly rounded and ignores
  // the double rounding when dropping to singles
  for(uint32_t i=0; i<0xffffff; i++) {
 #if 1
    float x0 = rng_f32(); x0 = 2.f*x0-1.f;

    // grow the range a bit
    //x0 *= 16.f;
 #else    
    float x0 = 0.25f*rng_f32();
 #endif    
    float t0,t1;
    
    double x1 = M_PI*x0;
    t1 = (float)tan(x1);
    t0 = f32_tanpi(x0);
    
    // simple instead of ss version & fixup
    uint32_t diff = f32_ulp_dist(t0,t1);

 #if 1
    if (diff > maxD /*|| diff >= HISTO_LEN*/) {
      maxD = diff;
      printf("%14a (%14e)| % f % f | %14a %14a | %2u\n", x0,x0,t1,t0,t1,t0,diff);
    }
 #endif  
    
    diff = (diff < HISTO_LEN) ? diff : HISTO_LEN-1;

    histo[diff]++;
  }

  printf("\ntan {");
  for(uint32_t i=0; i<HISTO_LEN; i++) {
    printf("%u,", histo[i]);
  }
  printf("}\n");
 }


 int main(void)
 {
  uint64_t s0;
  uint64_t s1;

 #ifdef RANDOMIZE
  s0 = _rdtsc();
 #else  
  s0 = 0x77535345;
 #endif  
  s1 = 0x1234567;

  reset_generators(s0,s1);

  test();
  
  return 0;
 }
	// Public Domain under http://unlicense.org, see link for details.

	#include <stdint.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <math.h>
	#include <string.h>

	#include "xmmintrin.h"
	#include "wmmintrin.h"
	#include "x86intrin.h"

	#define RANDOMIZE

	// histogram size
	#define HLEN 127
	#define TLEN 0xFFFFFF

	// external code: xoroshiro128+

	inline uint32_t f32_to_bits(float x)
	{
	uint32_t u; memcpy(&u,&x,4); return u;
	}

	inline float f32_from_bits(uint32_t x)
	{
	float u; memcpy(&u,&x,4); return u;
	}

	inline uint32_t u32_abs(uint32_t x)
	{
	return (int32_t)x >= 0 ? x : -x;
	}

	inline uint32_t f32_ulp_dist(float a, float b)
	{
	uint32_t ua = f32_to_bits(a);
	uint32_t ub = f32_to_bits(b);
	uint32_t s = ub^ua;

	if ((int32_t)s >= 0)
	return u32_abs(ua-ub);

	return ua+ub+0x80000000;
	}

	//*******************

	uint64_t rng_state[2];

	inline uint64_t rotl(const uint64_t v, int i)
	{
	return (v << i)\|(v >> (64-i));
	}

	inline uint64_t rng_u64(void)
	{
	uint64_t s0 = rng_state[0];
	uint64_t s1 = rng_state[1];
	uint64_t r = s0 + s1;

	s1 ^= s0;
	rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14);
	rng_state[1] = rotl(s1,36);

	return r;
	}
	// end: xoroshiro128+


	// double on [0,1)
	inline double rng_f64(void)
	{
	return (double)(rng_u64() >> (64-53))*0x1p-53;
	}

	inline float rng_f32(void)
	{
	return (float)(rng_u64() >> (64-24))*0x1p-24f;
	}

	void reset_generators(uint64_t s0, uint64_t s1)
	{
	rng_state[0] = s0;
	rng_state[1] = s1;
	rng_u64();
	}

	typedef struct {float a,b; } f32_pair_t;

	inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); }
	inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); }

	inline float f32_mask_select(uint32_t m, float a, float b)
	{
	return f32_from_bits((f32_to_bits(a) & m)\|(f32_to_bits(b) & ~m));
	}

	float f32_sin_pi_3_k[] = {0x1.91f5aap1f, -0x1.408cf2p2f};
	float f32_sin_pi_5_k[] = {0x1.921f8ep1f, -0x1.4aa618p2f, 0x1.3f3f3cp1f};
	float f32_sin_pi_7_k[] = {0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f};

	float f32_cos_pi_4_k[] = {0x1.fffe74p-1f, -0x1.3ba0ecp2f, 0x1.f73ff8p1f};
	float f32_cos_pi_6_k[] = {0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f};
	float f32_cos_pi_8_k[] = {0x1p0f, -0x1.3bd3ccp2f, 0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f};

	// tan(pi x)
	// spitball of max relative error 2.38419*10^-7 (2 ulp)
	float f32_tanpi(float a)
	{
	static const float* S = f32_sin_pi_7_k;
	static const float* C = f32_cos_pi_6_k;

	float c,s,a2,a3,r;
	uint32_t q,sgn;

	r = rintf(a+a);
	a = fmaf(r,-0.5f,a);
	q = (uint32_t)r; q = -(q&1);
	sgn = q << 31;
	a2 = a*a;

	c = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a;
	c = fmaf(c, a2, C[1]); s = fmaf(s, a2, S[1]);
	c = fmaf(c, a2, C[0]); s = a3 * s;
	c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s);

	float n = f32_mask_select(q,c,s);
	float d = f32_mask_select(q,s,c);

	return n/d;
	}

	// tan(pi x)
	// spitball of max relative error ~0.0000207424 (174 ulp)
	float f32_tanpi_r(float a)
	{
	static const float* S = f32_sin_pi_5_k;
	static const float* C = f32_cos_pi_4_k;

	float c,s,a2,a3,r;
	uint32_t q,sgn;

	r = rintf(a+a);
	a = fmaf(r,-0.5f,a);
	q = (uint32_t)r;
	sgn = q << 31; q = -(q&1);
	a2 = a*a;

	c = fmaf(C[2], a2, C[1]); s = fmaf(S[2], a2, S[1]); a3 = a2*a;
	c = fmaf(c, a2, C[0]); s = a3 * s;
	c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s);

	float n = f32_mask_select(q,c,s);
	float d = f32_mask_select(q,s,c);

	return n/d;
	}

	#define HISTO_LEN 177
	#define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))

	void test()
	{
	uint32_t histo[HISTO_LEN] = {0};
	uint32_t maxD = 0;

	// bordering on useless perf test
	#if 0
	double total=0.0;
	double min = 0x1p23f;

	uint64_t t0;
	uint64_t t1;

	for(uint32_t j=0; j<32; j++) {
	t0 = _rdtsc();
	float x0 = rng_f32(); x0 = 2.f*x0-1.f;
	float t = 0.f;

	for(uint32_t i=0; i<0xffff; i++) {
	float x0 = rng_f32(); x0 = 2.f*x0-1.f;

	float tr = f32_tanpi_r(x0);

	t += tr;
	}

	t1 = _rdtsc();
	double dt = (double)(t1-t0)*(1.0/65536.0);
	if (min > dt) min = dt;
	total += dt;
	printf("time: %f (%f)\n", dt, t);
	}
	printf(" %f %f\n", total*(1.0/32.0), min);
	#endif

	// questionable (to be generous) error testing
	// libm tan error bound is unknown but assumes
	// tan(PI*x) is correctly rounded and ignores
	// the double rounding when dropping to singles
	for(uint32_t i=0; i<0xffffff; i++) {
	#if 1
	float x0 = rng_f32(); x0 = 2.f*x0-1.f;

	// grow the range a bit
	//x0 *= 16.f;
	#else
	float x0 = 0.25f*rng_f32();
	#endif
	float t0,t1;

	double x1 = M_PI*x0;
	t1 = (float)tan(x1);
	t0 = f32_tanpi(x0);

	// simple instead of ss version & fixup
	uint32_t diff = f32_ulp_dist(t0,t1);

	#if 1
	if (diff > maxD /\|\| diff >= HISTO_LEN/) {
	maxD = diff;
	printf("%14a (%14e)\| % f % f \| %14a %14a \| %2u\n", x0,x0,t1,t0,t1,t0,diff);
	}
	#endif

	diff = (diff < HISTO_LEN) ? diff : HISTO_LEN-1;

	histo[diff]++;
	}

	printf("\ntan {");
	for(uint32_t i=0; i<HISTO_LEN; i++) {
	printf("%u,", histo[i]);
	}
	printf("}\n");
	}


	int main(void)
	{
	uint64_t s0;
	uint64_t s1;

	#ifdef RANDOMIZE
	s0 = _rdtsc();
	#else
	s0 = 0x77535345;
	#endif
	s1 = 0x1234567;

	reset_generators(s0,s1);

	test();

	return 0;
	}