Created
May 10, 2020 07:24
-
-
Save Marc-B-Reynolds/707db8df879e5e34ea21d3c13b782221 to your computer and use it in GitHub Desktop.
hacky scalar branch-free tanpi (not intended for use)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Public Domain under http://unlicense.org, see link for details. | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <math.h> | |
#include <string.h> | |
#include "xmmintrin.h" | |
#include "wmmintrin.h" | |
#include "x86intrin.h" | |
#define RANDOMIZE | |
// histogram size | |
#define HLEN 127 | |
#define TLEN 0xFFFFFF | |
// external code: xoroshiro128+ | |
inline uint32_t f32_to_bits(float x) | |
{ | |
uint32_t u; memcpy(&u,&x,4); return u; | |
} | |
inline float f32_from_bits(uint32_t x) | |
{ | |
float u; memcpy(&u,&x,4); return u; | |
} | |
inline uint32_t u32_abs(uint32_t x) | |
{ | |
return (int32_t)x >= 0 ? x : -x; | |
} | |
inline uint32_t f32_ulp_dist(float a, float b) | |
{ | |
uint32_t ua = f32_to_bits(a); | |
uint32_t ub = f32_to_bits(b); | |
uint32_t s = ub^ua; | |
if ((int32_t)s >= 0) | |
return u32_abs(ua-ub); | |
return ua+ub+0x80000000; | |
} | |
//******************* | |
uint64_t rng_state[2]; | |
inline uint64_t rotl(const uint64_t v, int i) | |
{ | |
return (v << i)|(v >> (64-i)); | |
} | |
inline uint64_t rng_u64(void) | |
{ | |
uint64_t s0 = rng_state[0]; | |
uint64_t s1 = rng_state[1]; | |
uint64_t r = s0 + s1; | |
s1 ^= s0; | |
rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14); | |
rng_state[1] = rotl(s1,36); | |
return r; | |
} | |
// end: xoroshiro128+ | |
// double on [0,1) | |
inline double rng_f64(void) | |
{ | |
return (double)(rng_u64() >> (64-53))*0x1p-53; | |
} | |
inline float rng_f32(void) | |
{ | |
return (float)(rng_u64() >> (64-24))*0x1p-24f; | |
} | |
void reset_generators(uint64_t s0, uint64_t s1) | |
{ | |
rng_state[0] = s0; | |
rng_state[1] = s1; | |
rng_u64(); | |
} | |
typedef struct {float a,b; } f32_pair_t; | |
inline float f32_xor(float a, float b) { return f32_from_bits(f32_to_bits(a) ^ f32_to_bits(b)); } | |
inline float f32_mulsign(float v, uint32_t s) { return f32_from_bits(f32_to_bits(v)^s); } | |
inline float f32_mask_select(uint32_t m, float a, float b) | |
{ | |
return f32_from_bits((f32_to_bits(a) & m)|(f32_to_bits(b) & ~m)); | |
} | |
float f32_sin_pi_3_k[] = {0x1.91f5aap1f, -0x1.408cf2p2f}; | |
float f32_sin_pi_5_k[] = {0x1.921f8ep1f, -0x1.4aa618p2f, 0x1.3f3f3cp1f}; | |
float f32_sin_pi_7_k[] = {0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f}; | |
float f32_cos_pi_4_k[] = {0x1.fffe74p-1f, -0x1.3ba0ecp2f, 0x1.f73ff8p1f}; | |
float f32_cos_pi_6_k[] = {0x1.fffffep-1f, -0x1.3bd37ep2f, 0x1.03acccp2f, -0x1.4dfd3ap0f}; | |
float f32_cos_pi_8_k[] = {0x1p0f, -0x1.3bd3ccp2f, 0x1.03c1b8p2f, -0x1.55b7cep0f, 0x1.d684aap-3f}; | |
// tan(pi x) | |
// spitball of max relative error 2.38419*10^-7 (2 ulp) | |
float f32_tanpi(float a) | |
{ | |
static const float* S = f32_sin_pi_7_k; | |
static const float* C = f32_cos_pi_6_k; | |
float c,s,a2,a3,r; | |
uint32_t q,sgn; | |
r = rintf(a+a); | |
a = fmaf(r,-0.5f,a); | |
q = (uint32_t)r; q = -(q&1); | |
sgn = q << 31; | |
a2 = a*a; | |
c = fmaf(C[3], a2, C[2]); s = fmaf(S[3], a2, S[2]); a3 = a2*a; | |
c = fmaf(c, a2, C[1]); s = fmaf(s, a2, S[1]); | |
c = fmaf(c, a2, C[0]); s = a3 * s; | |
c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s); | |
float n = f32_mask_select(q,c,s); | |
float d = f32_mask_select(q,s,c); | |
return n/d; | |
} | |
// tan(pi x) | |
// spitball of max relative error ~0.0000207424 (174 ulp) | |
float f32_tanpi_r(float a) | |
{ | |
static const float* S = f32_sin_pi_5_k; | |
static const float* C = f32_cos_pi_4_k; | |
float c,s,a2,a3,r; | |
uint32_t q,sgn; | |
r = rintf(a+a); | |
a = fmaf(r,-0.5f,a); | |
q = (uint32_t)r; | |
sgn = q << 31; q = -(q&1); | |
a2 = a*a; | |
c = fmaf(C[2], a2, C[1]); s = fmaf(S[2], a2, S[1]); a3 = a2*a; | |
c = fmaf(c, a2, C[0]); s = a3 * s; | |
c = f32_mulsign(c,sgn); s = fmaf(a, S[0], s); | |
float n = f32_mask_select(q,c,s); | |
float d = f32_mask_select(q,s,c); | |
return n/d; | |
} | |
#define HISTO_LEN 177 | |
#define LENGTHOF(X) (sizeof(X)/sizeof(X[0])) | |
void test() | |
{ | |
uint32_t histo[HISTO_LEN] = {0}; | |
uint32_t maxD = 0; | |
// bordering on useless perf test | |
#if 0 | |
double total=0.0; | |
double min = 0x1p23f; | |
uint64_t t0; | |
uint64_t t1; | |
for(uint32_t j=0; j<32; j++) { | |
t0 = _rdtsc(); | |
float x0 = rng_f32(); x0 = 2.f*x0-1.f; | |
float t = 0.f; | |
for(uint32_t i=0; i<0xffff; i++) { | |
float x0 = rng_f32(); x0 = 2.f*x0-1.f; | |
float tr = f32_tanpi_r(x0); | |
t += tr; | |
} | |
t1 = _rdtsc(); | |
double dt = (double)(t1-t0)*(1.0/65536.0); | |
if (min > dt) min = dt; | |
total += dt; | |
printf("time: %f (%f)\n", dt, t); | |
} | |
printf(" %f %f\n", total*(1.0/32.0), min); | |
#endif | |
// questionable (to be generous) error testing | |
// libm tan error bound is unknown but assumes | |
// tan(PI*x) is correctly rounded and ignores | |
// the double rounding when dropping to singles | |
for(uint32_t i=0; i<0xffffff; i++) { | |
#if 1 | |
float x0 = rng_f32(); x0 = 2.f*x0-1.f; | |
// grow the range a bit | |
//x0 *= 16.f; | |
#else | |
float x0 = 0.25f*rng_f32(); | |
#endif | |
float t0,t1; | |
double x1 = M_PI*x0; | |
t1 = (float)tan(x1); | |
t0 = f32_tanpi(x0); | |
// simple instead of ss version & fixup | |
uint32_t diff = f32_ulp_dist(t0,t1); | |
#if 1 | |
if (diff > maxD /*|| diff >= HISTO_LEN*/) { | |
maxD = diff; | |
printf("%14a (%14e)| % f % f | %14a %14a | %2u\n", x0,x0,t1,t0,t1,t0,diff); | |
} | |
#endif | |
diff = (diff < HISTO_LEN) ? diff : HISTO_LEN-1; | |
histo[diff]++; | |
} | |
printf("\ntan {"); | |
for(uint32_t i=0; i<HISTO_LEN; i++) { | |
printf("%u,", histo[i]); | |
} | |
printf("}\n"); | |
} | |
int main(void) | |
{ | |
uint64_t s0; | |
uint64_t s1; | |
#ifdef RANDOMIZE | |
s0 = _rdtsc(); | |
#else | |
s0 = 0x77535345; | |
#endif | |
s1 = 0x1234567; | |
reset_generators(s0,s1); | |
test(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment