Marc-B-Reynolds · February 23, 2023 18:03 · Marc-B-Reynolds · Feb 15, 2023 · morozlv · Feb 18, 2023
diff --git a/cbrt.c b/cbrt.c
 // Public Domain under http://unlicense.org, see link for details.
 // except:
 // * core-math function `cr_cbrtf`               (see license below)
 // * musl flavored fdlib function `fdlibm_cbrtf` (see license below)

 // code and test driver for cube root and it's reciprocal based on:
 // "Fast Calculation of Cube and Inverse Cube Roots Using
 // a Magic Constant and Its Implementation on Microcontrollers"
 // Moroz, Samotyy, Walczyk, Cieslinski, 2021
 // (PDF: https://www.mdpi.com/1996-1073/14/4/1058)
 //
 // Added some modification with higher powered devices in mind
 // (standard PCs, consoles, phones, etc). 

 // NOW INCLUDES: for testing normal cube root instead of MPFR. MPFR will be
 // used instead if USE_MPFR is defined. Can also "cheat" & test 1/cbrt
 // using libm & doubles. Needs to be validate on a given libm & range
 // So if USE_MPFR and USE_DOUBLES_LIKE_A_CHEATER are both defined then
 // MPFR isn't needed.

 /* Correctly-rounded cubic root of binary32 value.

 Copyright (c) 2022 Alexei Sibidanov.

 This file is part of the CORE-MATH project
 (https://core-math.gitlabpages.inria.fr/).

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

 /* origin: FreeBSD /usr/src/lib/msun/src/s_cbrtf.c */
 /*
 * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected].
 * Debugged and optimized by Bruce D. Evans.
 */
 /*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

 // if define uses MPFR for test cbrt instead of core-math's `cr_cbrtf`
 //#define USE_MPFR

 // if defined uses `lib_rcbrt_dp` instead of MPFR to test reciprocial
 // cube root. On my system this happens to return correctly rounded results
 // for the primary test range.  (beware! should validate the range with a given
 // libm before using.)
 #define USE_DOUBLES_LIKE_A_CHEATER

 // compile with (or equiv). (requires MPFR installed..unless as noted above)
 // clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off cbrt.c -o cbrt -lm -lmpfr

 // compile time macros:
 //   NO_RCBRT  : bypass testing 1/cbrt
 //   NO_CBRT   : bypass testing cbrt
 //   WIP_CBRT  : test only the cbrt function defined
 //   WIP_RCBRT : test only the 1/cbrt function defined


 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
 #include <assert.h>

 #if !defined(USE_MPFR) && !defined(USE_DOUBLES_LIKE_A_CHEATER)
 #include <mpfr.h>
 #endif

 //********************************************************
 // CONFIG STUFF HERE

 // number of ULP errors to track individually
 // (1 is used for post, 5 for source comments)
 // using define to avoid using a compiler extension. sigh.
 #define max_ulp_dist 5


 // choose smallest value of the test interval
 const float test_start_value = 0.25f;            // cover [1/4,2] (both sides of 1)

 //const float test_start_value = 0.5f;             // boring, but useful
 //const float test_start_value = 0x1.0p-126f;   // smallest mag. normal 
 //const float test_start_value = 0x1.0p+122f;   // big but no overflow
 // max finite exponent is 127, we need to subtract 3
 // to cover the test range (124) and compute values
 // to the 4th power (another -2 = 122)


 // full range variants and perform sign validation: cbrt(-x) = -cbrt(x)
 #define TEST_FULL_RANGE

 static inline uint32_t f32_to_bits(float x)
 {
  uint32_t u; memcpy(&u,&x,4); return u;
 }

 static inline float f32_from_bits(uint32_t x)
 {
  float u; memcpy(&u,&x,4); return u;
 }

 // if 'v' is float and 's' is all clear (except sign bit)
 static inline float f32_mulsign(float v, uint32_t s)
 {
  return f32_from_bits(f32_to_bits(v)^s);
 }


 //********************************************************
 // variants based on:
 // "Fast Calculation of Cube and Inverse Cube Roots Using
 // a Magic Constant and Its Implementation on Microcontrollers"
 // Moroz, Samotyy, Walczyk, Cieslinski, 2021
 // SEE: algorithm 5 (A5) and algorithm 6



 // common computation for x^(1/3) and x^(-1/3)
 //   x  is positive
 //   xi is IEEE bit pattern of x
 //
 // (reworked core portion of A5/A6 by Moroz,
 //  Samotyy, Walczyk, Cieslinski)
 //
 // (1) unsigned div by constant faster than signed
 // (2) breaks dependences up (y*x) & (y*y) can be
 //     'in-flight' at the same time. The other
 //     two choices the product chain is serial. 
 static inline float f32_cbrt_ki(float x, uint32_t xi)
 {
  // authors optimized Halley's method first step
  // given their "magic" constant (in 1)
  const float c0 =  0x1.c09806p0f;
  const float c1 = -0x1.403e6cp0f;
  const float c2 =  0x1.04cdb2p-1f;

  // initial guess
  xi = 0x548c2b4b - xi/3;               // (1)

  // modified Halley's method
  float y = f32_from_bits(xi);
  float c = (x*y)*(y*y);                // (2)
  
  y = y*fmaf(c, fmaf(c2,c,c1), c0);

  return y;
 }

 // Newton step:
 //   y = 1/cbrt(x)
 //   f(y)    = 1/y^3 - x
 //   f'(y)   = -3/y^4
 //   y_{n+1} = y - f(y)/f'(y)
 //           = y + 1/3 (1/y^3 - x) y^4
 //           = y + 1/3 (y - xy^4) 
 //           = y(1 + 1/3 (1 - xy^3))
 //           = y(1 + 1/3 c)  : c = 1 - xy^3 
 //           = y + 1/3 yc

 // SEE: https://marc-b-reynolds.github.io/math/2019/03/12/FpDiv.html
 // RN(1/3) = RU(1/3)
 static const float f32_third    =  0x1.555556p-2f;
 static const float f32_third_l  = -0x1.555556p-27f;

 // 1/cbrt(x) newton step with correctly rounded div by 3
 // y = y_n (estimate in)

 static inline float f32_rcbrt_newton(float x, float y)
 {
  float c = fmaf(x,y*y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  
  return fmaf(y,t,y);
 }

 static inline float f32_rcbrt_newton_m(float x, float y)
 {
  float c = fmaf(x*y,y*y, -1.0f);
  float t = fmaf(c, -f32_third, c*(-f32_third_l));
  
  return fmaf(y,t,y);
 }

 // 1/cbrt(x) newton step with div by 3 as mul by recip
 // y = y_n (estimate in)
 static inline float f32_rcbrt_newton_fast(float x, float y)
 {
  float c = fmaf(x,y*y*y, -1.0f);

  y = y*fmaf(-f32_third,c,1.f);

  return y;
 }

 // dep-chain breakup: slightly hampers results
 static inline float f32_rcbrt_newton_fast_m(float x, float y)
 {
  float c = fmaf(x*y,y*y, -1.0f);

  y = y*fmaf(-f32_third,c,1.f);

  return y;
 }

 const double f64_third   = 0x1.5555555555555p-2;
 const double f64_third_l = 0x1.5555555555555p-56;

 // 1/cbrt(x) newton step with correctly rounded div by 3
 // y = y_n (estimate in)
 static inline double f64_rcbrt_newton(double x, double y)
 {
  double c = fma(x,y*y*y, -1.0);
  double t = fma(c, -f64_third, c*(-f64_third_l));

  y = fma(y,t,y);
  
  return y;
 }

 static inline double f64_rcbrt_newton_m(double x, double y)
 {
  double c = fma(x*y,y*y, -1.0);
  double t = fma(c, -f64_third, c*(-f64_third_l));

  y = fma(y,t,y);
  
  return y;
 }

 // 1/cbrt(x) newton step with div by 3 as mul by recip
 // y = y_n (estimate in)
 static inline double f64_rcbrt_newton_fast(double x, double y)
 {
  double c = fma(x,y*y*y, -1.0);

  y = y*fma(-f64_third,c,1.0);

  return y;
 }

 static inline double f64_rcbrt_newton_fast_m(double x, double y)
 {
  double c = fma(x*y,y*y, -1.0);

  y = y*fma(-f64_third,c,1.0);

  return y;
 }

 // Newton step:
 //   y       = cbrt(x)
 //   f(y)    = y^3 - x
 //   f'(y)   = 3y^2
 //   y_{n+1} = y - f(y)/f'(y)
 //           = y - (y^3 - x)/3y^2


 // cbrt(x) newton step. y = ~1/cbrt(x)
 static inline float f32_cbrt_newton_fast_(float x, float y)
 {
  float d = x*(y*y);
  float c = fmaf(d,y,-1.f);
  float t = fmaf(c, -f32_third, 1.f);
  y = d*(t*t);

  return y;
 }

 static inline float f32_cbrt_newton_fast(float x, float y)
 {
  float a = x*y;
  float d = a*y;
  float c = fmaf(d,y,-1.f);
  float t = fmaf(c, -f32_third, 1.f);
  y = d*(t*t);

  return y;
 }

 static inline double f64_cbrt_newton_fast(double x, double y)
 {
  double a = x*y;
  double d = a*y;
  double c = fma(d,y,-1.0);
  double t = fma(c, -f64_third, 1.0);
  y = d*(t*t);

  return y;
 }

 // temp hack (not reworked yet)
 static inline float f32_cbrt_newton(float x, float y)
 {
  float d = x*(y*y);
  float c = fmaf(d,y,-1.f);
  float s = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float t = s + 1.f;
  y = d*(t*t);

  return y;
 }

 // ( currently unused )
 // 1/cbrt(x) Halley step:
 //   y_{n+1} = y (1 + c(1/3 + 2/9 c)) , c = 1-xy^3
 //           = y (1 + (1/3c+ 2/9 c^2)), 
 //           = y (1 + (t + 2t^2)),   t = c/3
 //           = y (1 + a),   t = c/3

 static inline float f32_rcbrt_halley(float x, float y)
 {
  float c = fmaf(x,y*y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float a = fmaf(2.f*t,t,t);
  return fmaf(y,a,y);
 }

 static inline float f32_rcbrt_halley_m(float x, float y)
 {
  float c = fmaf(x*y,y*y, -1.0f);                   // xy^3-1
  float t = fmaf(c, -f32_third, c*(-f32_third_l));  // -RN(c/3)
  float a = fmaf(2.f*t,t,t);
  return fmaf(y,a,y);
 }

 //********************************************************
 // 

 // expand a cbrt/rcbrt core method for positive only input
 static inline float f32_cbrt_px(float x, float (*f)(float, uint32_t))
 {
  return f(x, f32_to_bits(x));                   
 }

 // expand a cbrt/rcbrt core method for full range: cbrt(-x) = -cbrt(x)
 static inline float f32_cbrt_fx(float x, float (*f)(float, uint32_t))
 {
  uint32_t ix = f32_to_bits(x);     // bit pattern of x
  uint32_t ax = ix & 0x7fffffff;    // |x|     {bits}
  uint32_t sx = ix & 0x80000000;    // sign(x) {bit }
  float    a  = f32_from_bits(ax);  // |x|
  float    r  = f(a,ax);            // core approximation
  
  return f32_mulsign(r, sx);        // restore sign
 }


 //********************************************************
 // cycle numbers spitball for full range expanded

 // Author's corrected version (algorithm 6.1).
 // (modified)
 // ulp   3
 // correctly:   52.266572% (13153314)
 // faithfully:  45.591750% (11473540)
 //   2 ulp:      2.141619% (538956)
 //   3 ulp:      0.000060% (15)
 static inline float f32_cbrt_k_1(float x, uint32_t ix)
 {
  float y = f32_cbrt_ki(x,ix);

  return f32_cbrt_newton_fast(x,y);
 }


 // sqrt(x*x^(-1/3)) = sqrt(x^(2/3)) = x^(1/3)
 // ulp   2
 // correctly:   77.553369% (19516945)
 // faithfully:  22.445829% (5648678)
 //   2 ulp:      0.000803% (202)
 static inline float f32_cbrt_k_2(float x, uint32_t ix)
 {
  float y = f32_cbrt_ki(x,ix);

  return sqrtf(x * f32_rcbrt_newton_fast_m(x,y));
 }

 // ulp   1
 // correctly:   80.880794% (20354319)
 // faithfully:  19.119206% (4811506)
 static inline float f32_cbrt_k_3(float x, uint32_t ix)
 {
  float y = f32_cbrt_ki(x,ix);

  return sqrtf(x * f32_rcbrt_newton_m(x,y));
 }

 // correctly:   98.239267% (24722722)
 // faithfully:   1.760733% (443103)
 static inline float f32_cbrt_k_4(float x, uint32_t ix)
 {
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_cbrt_newton_fast(d,y);

  return (float)y;
 }

 // correctly rounded : but questionable perf vs. fdlibm
 static inline float f32_cbrt_k_5(float x, uint32_t ix)
 {
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);
  y = f64_cbrt_newton_fast(d,y);

  return (float)y;
 }


 //********************************************************
 // expand full range cbrt variants

 float f32_cbrt_1(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_1); }
 float f32_cbrt_2(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_2); }
 float f32_cbrt_3(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_3); }
 float f32_cbrt_4(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_4); }
 float f32_cbrt_5(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_5); }
 //float f32_cbrt_6(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_6); }
 //float f32_cbrt_7(float x) { return f32_cbrt_fx(x, &f32_cbrt_k_7); }

 // expand postive input cbrt variants
 float f32_cbrt_1p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_1); }
 float f32_cbrt_2p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_2); }
 float f32_cbrt_3p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_3); }
 float f32_cbrt_4p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_4); }
 float f32_cbrt_5p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_5); }
 //float f32_cbrt_6p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_6); }
 //float f32_cbrt_7p(float x) { return f32_cbrt_px(x, &f32_cbrt_k_7); }


 //********************************************************
 // x^(-1/3) aka 1/cbrt(x)


 // A5
 // ulp   2
 // correctly:   72.750709% (18308316)
 // faithfully:  27.207556% (6847006)
 //   2 ulp:      0.041735% (10503)
 // ~17 cycles
 static inline float f32_rcbrt_k_1(float x, uint32_t ix)
 {
  float y = f32_cbrt_ki(x, ix);

  return f32_rcbrt_newton_fast(x,y);
 //return f32_rcbrt_newton_fast_m(x,y);
 }

 // A5: modified by
 // * correctly rounded divison by 3
 // ulp   1
 // correctly:   88.337028% (22230742)
 // faithfully:  11.662972% (2935083)
 // ~17 cycles
 static inline float f32_rcbrt_k_2(float x, uint32_t ix)
 {
  float y = f32_cbrt_ki(x,ix);

  return f32_rcbrt_newton(x,y);
 //return f32_rcbrt_newton_m(x,y);
 }

 // ulp   1
 // correctly:   99.150757% (24952106)
 // faithfully:   0.849243% (213719)
 static inline float f32_rcbrt_k_3(float x, uint32_t ix)
 {
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);

  return (float)y;
 }

 // appears correctly rounded: also appears to underperform
 // vs. locally compiled fdlibm version YMMV. fdlibm version
 // performs 2 floating-point divisions
 static inline float f32_rcbrt_k_4(float x, uint32_t ix)
 {
  double y = (double)f32_cbrt_ki(x,ix);
  double d = (double)x;

  y = f64_rcbrt_newton_fast(d,y);
  y = f64_rcbrt_newton_fast(d,y);

  return (float)y;
 }


 //*****************************************************
 // expand full/positive only 1/cbrt

 float f32_rcbrt_1(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_1); }
 float f32_rcbrt_2(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_2); }
 float f32_rcbrt_3(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_3); }
 float f32_rcbrt_4(float x)  { return f32_cbrt_fx(x, &f32_rcbrt_k_4); }

 float f32_rcbrt_1p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_1); }
 float f32_rcbrt_2p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_2); }
 float f32_rcbrt_3p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_3); }
 float f32_rcbrt_4p(float x) { return f32_cbrt_px(x, &f32_rcbrt_k_4); }

 //*****************************************************

 // for fast testing 
 float lib_rcbrt_dp(float x)
 {
  return (float)(1.0/cbrt((double)x));
 }


 // compiler provided libm cbrtf
 // ulp   1
 // correctly:   89.394180% (22496783)
 // faithfully:  10.605820% (2669042)
 // ~60.991279 cycle 
 float lib_cbrtf(float x)
 {
  return cbrtf(x);
 }

 // ulp   1
 // correctly:   85.348766% (21478721)
 // faithfully:  14.651234% (3687104)
 // ~35.4 cycles
 // WARNING:  The error skyrockets for small/large inputs
 // (well up to 15 ulp error)
 float lol_cbrt(float x)
 {
  return powf(x, 1.f/3.f);
 }

 // compiler provided libm cbrtf + divide
 // ulp   2
 // correctly:   72.071696% (18137437)
 // faithfully:  27.859262% (7011013)
 //   2 ulp:      0.069042% (17375)
 // ~ 64 cycles
 float lib_rcbrt(float x)
 {
  return 1.f/cbrtf(x);
 }

 // this is much better than I expected
 // ulp   1
 // correctly:   88.660451% (22312134)
 // faithfully:  11.339549% (2853691)
 // ~35.4 cycles
 // WARNING:  The error skyrockets for small/large inputs
 // (well up to 15 ulp error)
 // NOTE: positive only as written
 float lol_rcbrt(float x)
 {
  return powf(x, -f32_third);
 }

 //**************************** start of musl

 // correctly rounded : ~26.540119 cycles

 static const unsigned
 B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
 B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */

 float fdlibm_cbrtf(float x)
 {
  double_t r,T;
  union {float f; uint32_t i;} u = {x};
  uint32_t hx = u.i & 0x7fffffff;
  
  if (hx >= 0x7f800000)  /* cbrt(NaN,INF) is itself */
    return x + x;
  
  /* rough cbrt to 5 bits */
  if (hx < 0x00800000) {  /* zero or subnormal? */
    if (hx == 0)
      return x;  /* cbrt(+-0) is itself */
    u.f = x*0x1p24f;
    hx = u.i & 0x7fffffff;
    hx = hx/3 + B2;
  } else {
    hx = hx/3 + B1;
  }    

  u.i &= 0x80000000;
  u.i |= hx;
  
  /*
   * First step Newton iteration (solving t*t-x/t == 0) to 16 bits.  In
   * double precision so that its terms can be arranged for efficiency
   * without causing overflow or underflow.
   */
  T = u.f;
  r = T*T*T;
  T = T*((double_t)x+x+r)/(x+r+r);
  
  /*
   * Second step Newton iteration to 47 bits.  In double precision for
   * efficiency and accuracy.
   */
  r = T*T*T;
  T = T*((double_t)x+x+r)/(x+r+r);
  
  /* rounding to 24 bits is perfect in round-to-nearest mode */
  return (float)T;
 }

 //**************************** end of musl

 //********************************************************
 // driver and testing junk below here

 #define LENGTHOF(X) (sizeof(X)/sizeof(X[0]))

 typedef struct {
  float (*f)(float);
  char*   name;
 } func_entry_t;

 #define STRINGIFY(S)  STRINGIFY_(S)
 #define STRINGIFY_(S) #S
 #define ENTRY(X) { .f=&X, .name=STRINGIFY(X) }


 func_entry_t rcbrt_table[] =
 {
 #if !defined(WIP_RCBRT)
  
  // this is only for validating it's OK
 #if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  ENTRY(lib_rcbrt_dp), 
 #endif
  
  ENTRY(lib_rcbrt),

 #if defined(TEST_FULL_RANGE)    
  ENTRY(f32_rcbrt_1),
  ENTRY(f32_rcbrt_2),
  ENTRY(f32_rcbrt_3),
  ENTRY(f32_rcbrt_4),
 #else
  ENTRY(lol_rcbrt),
  ENTRY(f32_rcbrt_1p),
  ENTRY(f32_rcbrt_2p),
  ENTRY(f32_rcbrt_3p),
  ENTRY(f32_rcbrt_4p),
 #endif  

 #else
  ENTRY(WIP_RCBRT)
 #endif  
 };

 func_entry_t cbrt_table[] =
 {
 #if !defined(WIP_CBRT)

 #if defined(TEST_FULL_RANGE)    
  ENTRY(lib_cbrtf),
  ENTRY(fdlibm_cbrtf),
  ENTRY(f32_cbrt_1),
  ENTRY(f32_cbrt_2),
  ENTRY(f32_cbrt_3),
  ENTRY(f32_cbrt_4),
  ENTRY(f32_cbrt_5),
  //ENTRY(f32_cbrt_6),
  //ENTRY(f32_cbrt_7),
 #else
  ENTRY(lol_cbrt),
  ENTRY(f32_cbrt_1p),
  ENTRY(f32_cbrt_2p),
  ENTRY(f32_cbrt_3p),
  ENTRY(f32_cbrt_4p),
  ENTRY(f32_cbrt_5p),
  //ENTRY(f32_cbrt_6),
  //ENTRY(f32_cbrt_7),
 #endif  

 #else
  ENTRY(WIP_CBRT)
 #endif  
 };


 #undef ENTRY

 //********************************************************



 static inline uint32_t u32_abs(uint32_t x)
 {
  return (int32_t)x >= 0 ? x : -x;
 }

 // ulp distance provided a & b are finite
 // and have the same sign
 static inline uint32_t f32_ulp_dist_ss(float a, float b)
 {
  uint32_t ua = f32_to_bits(a);
  uint32_t ub = f32_to_bits(b);
  return u32_abs(ua-ub);
 }

 //**************************** start of core-math
 // SEE: https://core-math.gitlabpages.inria.fr
 // and license info at top of file.

 // oh my!
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpragmas"
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wunknown-warning-option"
 #pragma GCC diagnostic ignored "-Wconversion"
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #pragma GCC diagnostic ignored "-Wshorten-64-to-32"
 #pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
 #pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
 #pragma GCC diagnostic ignored "-Wfloat-conversion"
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"

 #define INEXACTFLAG 0

 typedef union {float f; uint32_t u;} b32u32_u;
 typedef union {double f; uint64_t u;} b64u64_u;


 float cr_cbrtf (float x){
  static const double escale[3] = {1.0, 0x1.428a2f98d728bp+0/* 2^(1/3) */, 0x1.965fea53d6e3dp+0/* 2^(2/3) */};
 #if INEXACTFLAG!=0
  volatile uint32_t flag = _mm_getcsr(); /* store MXCSR Control/Status Register */
 #endif
  b32u32_u cvt0 = {.f = x};
  uint32_t hx = cvt0.u, ix = 0x7fffffff&hx, e = ix>>23, mant = hx&0x7fffff;
  long sign = hx>>31;
  if(__builtin_expect(((e+1)&0xff)<2, 0)){
    if(e==0xff||ix==0) return x + x; /* 0, inf, nan */
    int nz = __builtin_clz(ix) - 8;  /* denormals */
    mant <<= nz;
    mant &= 0x7fffff;
    e -= nz - 1;
  }
  e += 899;
  b64u64_u cvt1 = {.u = (uint64_t)mant<<29|(0x3fful<<52)};
  uint32_t et = e/3, it = e%3;
  uint64_t isc = ((const uint64_t*)escale)[it];
  isc += (long)(et - 342)<<52;
  isc |= sign<<63;
  b64u64_u cvt2 = {.u = isc};
  static const double c[] = {0x1.1b0babccfef9cp-1, 0x1.2c9a3e94d1da5p-1, -0x1.4dc30b1a1ddbap-3, 0x1.7a8d3e4ec9b07p-6};
  const double u0 = 0x1.5555555555555p-2, u1 = 0x1.c71c71c71c71cp-3, u2 = 0x1.61f9add3c0ca4p-3;
  double z = cvt1.f, r = 1/z, z2 = z*z;
  double c0 = c[0] + z*c[1], c2 = c[2] + z*c[3], y = c0 + z2*c2, y2 = y*y;
  double w0 = y*u0, w1 = y*u1, w2 = y*u2;
  double h = y2*(y*r) - 1, h2 = h*h;
  y -= h*((w0 - w1*h) + w2*h2);
  y *= cvt2.f;
  b64u64_u cvt3 = {.f = y};
  float yf = y;
  long m0 = cvt3.u<<19, m1 = m0>>63;
  if(__builtin_expect((m0^m1)<(1l<<31),0)){
    b64u64_u cvt4 = {.u = (cvt3.u + (1ul<<31))&0xffffffff00000000ul};
    yf = cvt4.f;
 #if INEXACTFLAG!=0
    _mm_setcsr(flag); /* restore MXCSR Control/Status Register for exact roots to get rid of the inexact flag if risen inside the function */
 #endif
  }
  return yf;
 }

 #pragma GCC diagnostic pop

 //**************************** end of core-math

 void cbrt_test(func_entry_t* entry)
 {
  // with x on [1,8] -> cbrt(x) on [1,2]
  // assumming test_start_value = 1. otherwise likewise
  // scaled to cover all significand bit pattern range on
  // the output.
  const uint32_t xs = f32_to_bits(test_start_value);
  const uint32_t xe = f32_to_bits(8.f*test_start_value);

  const uint32_t total = xe-xs+1;

 #if defined(USE_MPFR)  
  mpfr_t mx,mr;
  
  mpfr_init2(mx, 24);
  mpfr_init2(mr, 24);
 #endif  

  uint32_t error_u = 0;
  uint32_t error_cnt = 0;

  printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

  uint32_t error[max_ulp_dist+1] = {0};

  float (*f)(float) = entry->f;

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);

    // compute correctly rounded
 #if defined(USE_MPFR)  
    mpfr_set_flt(mx, x,  MPFR_RNDN);          // no error
    mpfr_cbrt(mr, mx, MPFR_RNDN);             // RN(x^(1/3))
    
    float cr = mpfr_get_flt(mr, MPFR_RNDN);   // no error
 #else
    float cr = cr_cbrtf(x);                   // core-math correctly rounded
 #endif    
    
    float r0 = f( x);

 #if defined(TEST_FULL_RANGE)    
    float r1 = f(-x);

    if (r0 != -r1) {
      printf("sign handling broken %a %a\n", r0,r1);
      exit(0);
    }
 #endif    

    if (r0 == cr)
      error[0]++;
    else {
      uint32_t u0 = f32_ulp_dist_ss(cr,r0);
      
      // track it if small enough for the buckets
      if (u0 <= max_ulp_dist)
 	error[u0]++;
      else
 	error_cnt++;
      
      if (u0 > error_u) error_u = u0;
    }
  }
  
 #if defined(USE_MPFR)  
  mpfr_clear(mr);
  mpfr_clear(mx);
 #endif  

  printf("// ulp   %d\n", error_u);
  printf("// correctly:  %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

  if (error[1])
    printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

  for(uint32_t i=2; i<=max_ulp_dist; i++)
    if (error[i])
      printf("// %3d ulp:    %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

  if (error_cnt)
    printf("//  >%d ulp:    %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

  printf("\n");
 }


 void rcbrt_test(func_entry_t* entry)
 {
  // with x on [1,8] -> 1/cbrt(x) on [1/2,1]
  // assumming test_start_value = 1. otherwise likewise
  // scaled to cover all significand bit pattern range on
  // the output.
  const uint32_t xs = f32_to_bits(test_start_value);
  const uint32_t xe = f32_to_bits(8.f*test_start_value);

  const uint32_t total = xe-xs+1;

 #if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  mpfr_t m1,mx,mr;
  
  mpfr_init2(m1, 24);
  mpfr_init2(mx, 128);
  mpfr_init2(mr, 24);

  mpfr_set_flt(m1, 1.f, MPFR_RNDN);
 #endif  

  uint32_t error_u = 0;
  uint32_t error_cnt = 0;

  printf("starting: %s on [%08x, %08x]\n", entry->name, xs,xe);

  uint32_t error[max_ulp_dist+1] = {0};

  float (*f)(float) = entry->f;

  for(uint32_t xi=xs; xi<=xe; xi++) {
    float x  = f32_from_bits(xi);

 #if !defined(USE_DOUBLES_LIKE_A_CHEATER)
    // compute correctly rounded (well..sloppy actually)
    // I "should" write a test that checks if the MP
    // computation rounded up & rounded down both round
    // to same single precision value. 
    mpfr_set_flt(mx, x,  MPFR_RNDN);        // no error
    mpfr_cbrt(mx, mx, MPFR_RNDN);           // t = RN(x^(1/3)) {128 bits}
    mpfr_div(mr,m1,mx, MPFR_RNDN);          // r = RN(1/t)     {128 bits}
    
    float cr = mpfr_get_flt(mr, MPFR_RNDN); // RN(t) {single precision}
 #else
    float cr = lib_rcbrt_dp(x);
 #endif    
    
    float r0 = f(x);

 #if defined(TEST_FULL_RANGE)    
    float r1 = f(-x);

    if (r0 != -r1) {
      printf("sign handling broken %a %a\n", r0,r1);
      exit(0);
    }
 #endif    
    
    if (r0 == cr)
      error[0]++;
    else {
      uint32_t u0 = f32_ulp_dist_ss(cr,r0);
      
      // track it if small enough for the buckets
      if (u0 <= max_ulp_dist)
 	error[u0]++;
      else
 	error_cnt++;
      
      if (u0 > error_u) error_u = u0;
    }
  }
  
 #if !defined(USE_DOUBLES_LIKE_A_CHEATER)
  mpfr_clear(mr);
  mpfr_clear(mx);
 #endif  

  printf("// ulp   %d\n", error_u);
  printf("// correctly:  %10f%% (%d)\n", 100.0*(double)(error[0])/(double)(total), error[0]);

  if (error[1])
    printf("// faithfully: %10f%% (%d)\n", 100.0*(double)(error[1])/(double)(total), error[1]);

  for(uint32_t i=2; i<=max_ulp_dist; i++)
    if (error[i])
      printf("// %3d ulp:    %10f%% (%d)\n", i,100.0*(double)(error[i])/(double)(total), error[i]);

  if (error_cnt)
    printf("//  >%d ulp:    %10f%% (%d)\n", max_ulp_dist,100.0*(double)(error_cnt)/(double)(total), error_cnt);

  printf("\n");
 }

 //********************************************************

 int main(void)
 {
  // compile time macros to bypass
 #if !defined(NO_RCBRT)
  for(uint32_t i=0; i<LENGTHOF(rcbrt_table); i++)
    rcbrt_test(rcbrt_table+i);
 #endif  

 #if !defined(NO_CBRT)
  for(uint32_t i=0; i<LENGTHOF(cbrt_table); i++)
    cbrt_test(cbrt_table+i);
 #endif

  return 0;
 }

diff --git a/summary.md b/summary.md
function	cycles	max	CR	FR	>ULP	Notes
lib_rcbrt	63.0	2	72.071696%	27.859262%	0.069042%	glibc 1/cbrtf
f32_rcbrt_1	16.2	2	72.746727%	27.211490%	0.041783%	paper version w mods
f32_rcbrt_2	17.5	1	88.337028%	11.662972%	---	CR div by 3
f32_rcbrt_3	20.1	1	99.150757%	0.849243%	---	double promote
f32_rcbrt_4	26.1	0	100.000000%	---	---	double promote + extra step
function	cycles	max	CR	FR	>ULP	Notes
lib_cbrtf	60.6	1	89.394180%	10.605820%	---	glib
fdlibm_cbrtf	26.2	0	100.000000%	---	---	in source copy
f32_cbrt_1	17.3	3	52.266572%	45.591750%	2.141679%	author's 6.1 with mods
f32_cbrt_2	18.5	2	77.553369%	22.445829%	0.000803%	sqrt variant
f32_cbrt_3	19.7	1	80.880794%	19.119206%	---	sqrt + CR div by 3
f32_cbrt_4	21.3	1	98.239267%	1.760733%	---	double promote
f32_cbrt_5	28.2	0	100.000000%	---	---	double promote + extra step