maikxchd · May 2, 2024 05:14
diff --git a/cbe_spu_fatan.c b/cbe_spu_fatan.c
 #ifndef CBE_SPU_FATAN
 #define CBE_SPU_FATAN

 #include <stdint.h>
 #include <spu_intrinsics.h>

 /* CBE_SPU_FATAN */

 // Implementation of atan for the CELL BE Synergistic Processing Unit
 // (SPU) using SIMD and IPL features

 // Helper functions to load constant values into quadwords
 // These functions load pre-computed constant values into quadwords,
 // which can be efficiently used in SIMD operations on the CELL SPE.
 // The constant values are stored as arrays of 32-bit integers, and
 // the functions use the si_lqa intrinsic to load these values into
 // quadwords for use in SIMD computations.

 // Global Floating-point constants (32 bit)                        
 // Constant is loaded in each element of 32 bit floating-point vector from local store.
 // Since the LS is so fast, loading these contants from SPU local memory is going to win 
 // over building the or storing them locally near the function.
 //                        
 // cp_flpio4()  +PI/+4 
 // cp_flt3p8()  tan( +3.0 * PI / +8.0 )
 // cp_flnpio2() -PI/+2
 // cp_flpio2()  +PI/+2
 // cp_flpt66()  +0.66
 // cp_flpi()    +PI
 // cp_flnpi()   -PI

 static const vector unsigned int _cp_f_pio4 = { 0x3f490fdb, 0x3f490fdb, 0x3f490fdb, 0x3f490fdb };
 static const vector unsigned int _cp_f_t3p8 = { 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d };
 static const vector unsigned int _cp_f_npio2 = { 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb };
 static const vector unsigned int _cp_f_pio2 = { 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb };
 static const vector unsigned int _cp_f_pt66 = { 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab };
 static const vector unsigned int _cp_f_pi = { 0x40490fdb, 0x40490fdb, 0x40490fdb, 0x40490fdb };
 static const vector unsigned int _cp_f_npi = { 0xc0490fdb, 0xc0490fdb, 0xc0490fdb, 0xc0490fdb };
 static const vector unsigned int _cp_f_morebits = { 0x38800000, 0x38800000, 0x38800000, 0x38800000 };
 static const vector unsigned int _cp_f_hmorebits = { 0x34000000, 0x34000000, 0x34000000, 0x34000000 };

 // Helper functions to load constant values into quadwords
 static inline qword cp_flpio4(void) { return si_lqa((intptr_t)&_cp_f_pio4); }
 static inline qword cp_flt3p8(void) { return si_lqa((intptr_t)&_cp_f_t3p8); }
 static inline qword cp_flnpio2(void) { return si_lqa((intptr_t)&_cp_f_npio2); }
 static inline qword cp_flpio2(void) { return si_lqa((intptr_t)&_cp_f_pio2); }
 static inline qword cp_flpt66(void) { return si_lqa((intptr_t)&_cp_f_pt66); }
 static inline qword cp_flpi(void) { return si_lqa((intptr_t)&_cp_f_pi); }
 static inline qword cp_flnpi(void) { return si_lqa((intptr_t)&_cp_f_npi); }
 static inline qword cp_filzero(void) { return si_ilhu((int16_t)0x0000); }
 static inline qword cp_filnzero(void) { return si_ilhu((int16_t)0x8000); }
 static inline qword cp_filone(void) { return si_ilhu((int16_t)0x3f80); }
 static inline qword cp_filtwo(void) { return si_ilhu((int16_t)0x4000); }
 static inline qword cp_filinf(void) { return si_ilhu((int16_t)0x7f80); }
 static inline qword cp_filninf(void) { return si_ilhu((int16_t)0xff80); }
 static inline qword cp_filnan(void) { return si_ilhu((int16_t)0x7fc0); }

 // Polynomial coefficients for cp_fatan approximation
 static const vector unsigned int _cp_f_atan_p7 = { 0x3c08876a, 0x3c08876a, 0x3c08876a, 0x3c08876a };
 static const vector unsigned int _cp_f_atan_p6 = { 0xbd954629, 0xbd954629, 0xbd954629, 0xbd954629 };
 static const vector unsigned int _cp_f_atan_p5 = { 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1 };
 static const vector unsigned int _cp_f_atan_p4 = { 0xbf49eee6, 0xbf49eee6, 0xbf49eee6, 0xbf49eee6 };
 static const vector unsigned int _cp_f_atan_p3 = { 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5 };
 static const vector unsigned int _cp_f_atan_p2 = { 0xbf62365c, 0xbf62365c, 0xbf62365c, 0xbf62365c };
 static const vector unsigned int _cp_f_atan_p1 = { 0x3f490965, 0x3f490965, 0x3f490965, 0x3f490965 };
 static const vector unsigned int _cp_f_atan_p0 = { 0xbf2697e0, 0xbf2697e0, 0xbf2697e0, 0xbf2697e0 };

 // Higher-degree polynomial coefficients for cp_fatan approximation
 // These are pre-computed constant vectors representing the coefficients
 // of the polynomials used to approximate the atan function.
 static const vector unsigned int _cp_f_atan_q7 = { 0x3c0897d0, 0x3c0897d0, 0x3c0897d0, 0x3c0897d0 };
 static const vector unsigned int _cp_f_atan_q6 = { 0xbd890e31, 0xbd890e31, 0xbd890e31, 0xbd890e31 };
 static const vector unsigned int _cp_f_atan_q5 = { 0x3f6c4616, 0x3f6c4616, 0x3f6c4616, 0x3f6c4616 };
 static const vector unsigned int _cp_f_atan_q4 = { 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2 };
 static const vector unsigned int _cp_f_atan_q3 = { 0x3eb6679b, 0x3eb6679b, 0x3eb6679b, 0x3eb6679b };
 static const vector unsigned int _cp_f_atan_q2 = { 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9 };
 static const vector unsigned int _cp_f_atan_q1 = { 0x3f7df927, 0x3f7df927, 0x3f7df927, 0x3f7df927 };
 static const vector unsigned int _cp_f_atan_q0 = { 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 };

 // _cp_fatan(x)
 //
 // Computes the arctangent of the input value x using a polynomial approximation.
 //
 // Input:
 //   x: A quadword representing a single-precision floating-point value or a vector
 //      of four single-precision floating-point values.
 //
 // Output:
 //   A quadword containing the arctangent of x (or a vector of arctangents if x was a vector).
 //
 // This function utilizes SIMD instructions and the ILP (Instruction-Level Parallelism)
 // capabilities of the CELL SPE to compute the arctangent of four single-precision
 // floating-point values simultaneously. The polynomial approximation is implemented
 // using a series of SIMD operations, taking advantage of the CELL SPE's vector operations
 // and the Fused Multiply-Add (FMA) instruction to achieve efficient parallel computation.
 //
 // The FMA instruction performs a single, combined multiply-add operation, computing
 // (a * b) + c with a single instruction. This can improve performance compared to
 // separate multiply and add instructions, as it reduces the number of instructions required
 // and can utilize specialized hardware for faster execution.
 //
 // In this implementation, the FMA instruction is used extensively in the polynomial
 // evaluation and range reduction steps, allowing for efficient computation of expressions
 // like (a * x^2) + (b * x) + c in a single instruction.
 static inline qword _cp_fatan(const qword x)
 {
    const qword f_one = cp_filone();
    const qword f_inf = cp_filinf();
    const qword f_ninf = cp_filninf();
    const qword f_msb = cp_filnzero();
    const qword f_zero = cp_filzero();
    const qword f_pt66 = cp_flpt66();
   const qword f_pio2 = cp_flpio2();
   const qword f_npio2 = cp_flnpio2();
   const qword f_pio4 = cp_flpio4();
   const qword f_t3p8 = cp_flt3p8();

   const qword f_atan_p0 = si_lqa((intptr_t)&_cp_f_atan_p0);
   const qword f_atan_p1 = si_lqa((intptr_t)&_cp_f_atan_p1);
   const qword f_atan_p2 = si_lqa((intptr_t)&_cp_f_atan_p2);
   const qword f_atan_p3 = si_lqa((intptr_t)&_cp_f_atan_p3);
   const qword f_atan_p4 = si_lqa((intptr_t)&_cp_f_atan_p4);
   const qword f_atan_q0 = si_lqa((intptr_t)&_cp_f_atan_q0);
   const qword f_atan_q1 = si_lqa((intptr_t)&_cp_f_atan_q1);
   const qword f_atan_q2 = si_lqa((intptr_t)&_cp_f_atan_q2);
   const qword f_atan_q3 = si_lqa((intptr_t)&_cp_f_atan_q3);
   const qword f_atan_q4 = si_lqa((intptr_t)&_cp_f_atan_q4);
   const qword f_morebits = si_lqa((intptr_t)&_cp_f_morebits);
   const qword f_hmorebits = si_lqa((intptr_t)&_cp_f_hmorebits);
   
   // ##                        
   // ## pos_x = -x            { x < 0
   // ##          x            { otherwise
   // ##                        
   // This section computes the absolute value of the input x, which is needed for the range
   // reduction step. The sign of the input is handled separately.
   //
   // neg_x is computed by XORing x with f_msb, which flips the sign bit of x, effectively
   // negating x. sign_mask is a mask that identifies if x is negative or not, using the si_fcgt 
   // intrinsic to perform a floating-point greater-than comparison with 0.0.
   //
   // pos_x is then computed by selecting between x and neg_x based on the sign_mask, using
   // the si_selb intrinsic. If x is negative, neg_x (-x) is selected, otherwise x is selected.
   // This operation effectively computes the absolute value of x.
   
   const qword neg_x = si_xor(x, f_msb);          
   const qword sign_mask = si_fcgt(f_zero, x);
   const qword pos_x = si_selb(x, neg_x, sign_mask);
               

   // Range reduction
   // ##                        
   // ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) )
   // ## range1_mask = ( pos_x <= 0.66 )
   // ## range2_mask = !( range0_mask || range1_mask )
   // ##                        
   //
   // This section performs range reduction, which maps the input values into a smaller range
   // where the arctangent function can be more accurately approximated using a polynomial.
   //
   // range0_mask identifies cases where pos_x is greater than tan(3π/8), which is a constant
   // value f_t3p8. The si_fcgt intrinsic performs a floating-point greater-than comparison.
   //
   // range1_mask identifies cases where pos_x is less than or equal to 0.66, which is a constant
   // value f_pt66. It is computed by combining two masks using si_or: range1_gt_mask (pos_x < 0.66)
   // and range1_eq_mask (pos_x == 0.66).
   //
   // range2_mask identifies the remaining cases that are not covered by range0_mask or range1_mask.
   // It is computed by performing a bitwise NOR operation (si_nor) on range0_mask and range1_mask.
   //
   // These range masks are used to select different computations and approximations for the
   // arctangent function, based on the range in which the input value falls.                 
   
   const qword range0_mask = si_fcgt(pos_x, f_t3p8);
   const qword range1_gt_mask = si_fcgt(f_pt66, pos_x);
   const qword range1_eq_mask = si_fceq(f_pt66, pos_x);
   const qword range1_mask = si_or(range1_gt_mask, range1_eq_mask);
   const qword range2_mask = si_nor(range0_mask, range1_mask);

   // Range reduction using FMA
   // 
   // range0_x = -1.0 
   //            -----
   //            pos_x
   //
   // The range reduction step for the range0 case involves dividing -1.0 by pos_x.
   // This is implemented using a series of FMA (Fused Multiply-Add) instructions, which perform
   // the operation (a * b) + c in a single instruction for improved performance.
   //
   // The steps to compute range0_x are:
   // range0_x0 = pos_x - (trunc(pos_x) * pos_x)  // Compute the fractional part of pos_x
   // range0_x1 = trunc(pos_x)                    // Compute the integer part of pos_x
   // range0_x2 = -range0_x1 * pos_x + 1.0        // Compute -range0_x1 * pos_x and add 1.0
   // range0_x3 = range0_x2 * range0_x1 + range0_x1  // Final result: range0_x3 = -1.0 / pos_x
   //
   // The FMA instructions are used to improve performance by reducing the number of instructions
   // and utilizing specialized hardware for faster execution.                      
   
   const qword range0_x0 = si_frest(pos_x);
   const qword range0_x1 = si_fi(pos_x, range0_x0);
   const qword range0_x2 = si_fnms(range0_x1, pos_x, f_one);
   const qword range0_x3 = si_fma(range0_x2, range0_x1, range0_x1);
   const qword range0_x = si_xor(range0_x3, f_msb);
   const qword range0_y = f_pio2;
   
   // ##                        
   // ## range1_x = pos_x
   // ## range1_y = 0.0
   // ##                        
   //
   // For the range1 case, where the input value pos_x is less than or equal to 0.66,
   // the range-specific values are simply set to:
   //
   // range1_x = pos_x
   // range1_y = 0.0
   //
   // This means that the input value itself (pos_x) is used for the arctangent approximation
   // in this range, and the corresponding y-value is set to 0.0.                  
   
   const qword range1_x = pos_x;
   const qword range1_y = f_zero;
   
   
   // ##                        
   // ## range2_x = (pos_x-1.0)
   // ##            -----------
   // ##            (pos_x+1.0)
   // ##                        
   // ## range2_y = PI
   // ##            ---
   // ##            4.0
   // ##                        
   //
   // For the range2 case, the range reduction involves computing (pos_x - 1.0) / (pos_x + 1.0).
   // This is done using a series of steps:
   // range2_x0num = pos_x - 1.0             // Compute the numerator
   // range2_x0den = pos_x + 1.0             // Compute the denominator
   // range2_x0 = range2_x0den - (trunc(range2_x0den) * range2_x0den)  // Compute the fractional part of the denominator
   // range2_x1 = -range2_x0 * range2_x0den + 1.0  // Compute the reciprocal of the denominator
   // range2_x2 = range2_x1 * range2_x0             // Refine the reciprocal
   // range2_x = range2_x0num * range2_x2           // Final result: range2_x = (pos_x - 1.0) / (pos_x + 1.0)
   //
   // The range2_y value is set to PI/4.0, which is used in the final arctangent computation.    
   
   const qword range2_y = f_pio4;
   const qword range2_x0num = si_fs(pos_x, f_one);
   const qword range2_x0den = si_fa(pos_x, f_one);
   const qword range2_x0 = si_frest(range2_x0den);
   const qword range2_x1 = si_fnms(range2_x0, range2_x0den, f_one);
   const qword range2_x2 = si_fma(range2_x1, range2_x0, range2_x0);
   const qword range2_x = si_fm(range2_x0num, range2_x2);
   
   // ##                        
   // ## range_x  = range0_x { range0_mask
   // ##            range1_x { range1_mask
   // ##            range2_x { range2_mask
   // ##                        
   // ## range_y  = range0_y { range0_mask
   // ##            range1_y { range1_mask
   // ##            range2_y { range2_mask
   // ##                        
   //
   // Based on the range masks computed earlier (range0_mask, range1_mask, range2_mask),
   // this section selects the appropriate range-specific values for x (range_x) and y (range_y).
   //
   // The range-specific values (range0_x, range1_x, range2_x, range0_y, range1_y, range2_y)
   // are precomputed constants that are used in the arctangent approximation for each range.
   //
   // The si_selb intrinsic is used to select the appropriate value based on the range masks.
   // It performs a conditional selection operation, selecting the first value if the corresponding
   // mask bit is 0, and the second value if the mask bit is 1.
   //
   // For range_x:
   // 1. range_x0 is computed by selecting between range2_x and range0_x based on range0_mask.
   // 2. range_x is computed by selecting between range_x0 and range1_x based on range1_mask.
   //
   // For range_y:
   // 1. range_y0 is computed by selecting between range2_y and range0_y based on range0_mask.
   // 2. range_y is computed by selecting between range_y0 and range1_y based on range1_mask.
   //
   // These range-specific values are used in the subsequent steps of the arctangent approximation.                  
   
   const qword range_x0 = si_selb(range2_x, range0_x, range0_mask);
   const qword range_x = si_selb(range_x0, range1_x, range1_mask);
   const qword range_y0 = si_selb(range2_y, range0_y, range0_mask);
   const qword range_y = si_selb(range_y0, range1_y, range1_mask);
   
    // Polynomial approximation using FMA
    //
    // zdiv   =  P  + P xp2 + P xp2^2 + P xp2^3 + P xp2^4
    //            0    1       2        3       4
    //           ------------------------------------------
    //           Q  + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5
    //            0    1       2        3       4
    //
    // The polynomial approximation is implemented using Estrin's method, evaluating
    // the numerator and denominator polynomials separately using FMA instructions,
    // and then dividing the numerator by the denominator to obtain the final result.
    //
    // The numerator is computed as:
    // znum0 = f_atan_p0
    // znum1 = (znum0 * xp2) + f_atan_p1
    // znum2 = (znum1 * xp2) + f_atan_p2
    // znum3 = (znum2 * xp2) + f_atan_p3
    // znum  = (znum3 * xp2) + f_atan_p4
    //
    // The denominator is computed as:
    // zden0 = xp2 + f_atan_q0
    // zden1 = (zden0 * xp2) + f_atan_q1
    // zden2 = (zden1 * xp2) + f_atan_q2
    // zden3 = (zden2 * xp2) + f_atan_q3
    // zden  = (zden3 * xp2) + f_atan_q4 + xp2
    //
    // The final result, zdiv, is computed by dividing znum by zden using
    // additional FMA instructions for improved accuracy. 
   
   const qword xp2 = si_fm(range_x, range_x);
   const qword znum0 = f_atan_p0;
   const qword znum1 = si_fma(znum0, xp2, f_atan_p1); // FMA: (znum0 * xp2) + f_atan_p1
   const qword znum2 = si_fma(znum1, xp2, f_atan_p2); // FMA: (znum1 * xp2) + f_atan_p2
   const qword znum3 = si_fma(znum2, xp2, f_atan_p3); // FMA: (znum2 * xp2) + f_atan_p3
   const qword znum = si_fma(znum3, xp2, f_atan_p4); // FMA: (znum3 * xp2) + f_atan_p4
   const qword zden0 = si_fa(xp2, f_atan_q0); 
   const qword zden1 = si_fma(zden0, xp2, f_atan_q1); // FMA: (zden0 * xp2) + f_atan_q1
   const qword zden2 = si_fma(zden1, xp2, f_atan_q2); // FMA: (zden1 * xp2) + f_atan_q2
   const qword zden3 = si_fma(zden2, xp2, f_atan_q3); // FMA: (zden2 * xp2) + f_atan_q3
   const qword zden = si_fma(zden3, xp2, f_atan_q4); // FMA: (zden3 * xp2) + f_atan_q4
   const qword zden_r0 = si_frest(zden);
   const qword zden_r1 = si_fnms(zden_r0, zden, f_one);
   const qword zden_r = si_fma(zden_r1, zden_r0, zden_r0);
   const qword zdiv = si_fm(znum, zden_r);
   const qword z0 = si_fm(xp2, zdiv);
   const qword z1 = si_fma(range_x, z0, range_x);
   
   // ##                        
   // ## zadd      =  z1 + 0.5 * MOREBITS { range2_mask
   // ##              z1 + MOREBITS       { range1_mask
   // ##              z1                  { otherwise
   // ##                        
   // ## yaddz     = range_y + zadd
   // ##                        
   // ## pos_yaddz = yaddz      { yaddz >= 0
   // ##             -yaddz     { yaddz <  0
   // ##                        
   
   const qword zadd0 = si_selb(f_zero, f_hmorebits, range2_mask);
   const qword zadd1 = si_selb(zadd0, f_morebits, range1_mask);
   const qword zadd = si_fa(z1, zadd1);
   const qword yaddz = si_fa(range_y, zadd);
   const qword neg_yaddz = si_xor(yaddz, f_msb);
   const qword pos_yaddz = si_selb(yaddz, neg_yaddz, sign_mask);
   
    // Handling special cases
    //
    // result_y0 = 0.0        { x == 0.0
    //             pos_yaddz  { otherwise
    //
    // In this step, the function checks if the input value x is equal to 0.0.
    // If it is, the result is set to 0.0. Otherwise, the result is set to pos_yaddz,
    // which is the computed arctangent value after range reduction and polynomial
    // approximation.                   
   
   const qword x_eqz_mask = si_fceq(f_zero, x);
   const qword result_y0 = si_selb(pos_yaddz, x, x_eqz_mask);
   
    // result_y2 = +PI         {
    //             ---         { x == INF
    //             2.0         {
    //
    //             -PI         {
    //             ---         { x == -INF
    //             2.0         {
    //
    //             result_y0   { otherwise
    //
    // This section handles the cases where the input value x is positive or negative
    // infinity. If x is positive infinity, the result is set to pi/2.0. If x is negative
    // infinity, the result is set to -pi/2.0. Otherwise, the result is set to result_y0,
    // which is the computed arctangent value from the previous step.              
   
   const qword x_eqinf_mask = si_fceq(f_inf, x);
   const qword x_eqninf_mask = si_fceq(f_ninf, x);
   const qword result_y1 = si_selb(result_y0, f_pio2, x_eqinf_mask);
   const qword result = si_selb(result_y1, f_npio2, x_eqninf_mask);

   return result;
 }

 // cp_fatan(x)
 //
 // Computes the arctangent of the input vector float x using _cp_fatan.
 //
 // Input:
 //   x: A vector float representing four single-precision floating-point values.
 //
 // Output:
 //   A vector float containing the arctangent of each element in x.
 //
 // This function acts as a wrapper around _cp_fatan, converting the input
 // vector float to a quadword, computing the arctangent using _cp_fatan,
 // and then converting the result back to a vector float.

 static inline vector float cp_fatan(const vector float x)
 {
   return (vector float)(_cp_fatan((qword)x));
 }

 // cp_fatan_scalar(x)
 //
 // Computes the arctangent of the input single-precision float x using _cp_fatan.
 //
 // Input:
 //   x: A single-precision floating-point value.
 //
 // Output:
 //   The arctangent of x as a single-precision floating-point value.
 //
 // This function acts as a wrapper around _cp_fatan, converting the input
 // float to a quadword, computing the arctangent using _cp_fatan, and then
 // converting the result back to a float.
 //
 // The steps involved are:
 // 1. Convert the input float x to a quadword vx using si_from_float.
 // 2. Call _cp_fatan(vx) to compute the arctangent of vx, storing the result in vresult.
 // 3. Convert the quadword vresult back to a float result using si_to_float.
 // 4. Return the float result.
 //
 // This function provides a convenient way to compute the arctangent of a single float value,
 // while still leveraging the SIMD capabilities of the _cp_fatan function.

 static inline float cp_fatan_scalar(const float x)
 {
   const qword vx = si_from_float(x);
   const qword vresult = _cp_fatan(vx);
   const float result = si_to_float(vresult);

   return result;
 }

 static inline float cp_fatan_scalar(const float x)
 {
   const qword vx = si_from_float(x);
   const qword vresult = _cp_fatan(vx);
   const float result = si_to_float(vresult);

   return result;
 }

 // ## cp_fatan2(y, x)
 // ##
 // ## Function to compute the angle (in radians) between the positive x-axis and the 
 // ## point (x, y) in the Cartesian plane, using the atan2 function.
 // ##
 // ## Input:
 // ##   y: float or vector float representing the y-coordinate
 // ##   x: float or vector float representing the x-coordinate
 // ##
 // ## Output:
 // ##   The angle (theta) in radians between the positive x-axis and the point (x, y)
 // ##   in the range [-pi, pi].
 // ##
 // ## Mathematical Description:
 // ##   The atan2 function is a variation of the arctangent function, which computes
 // ##   the angle (in radians) between the positive x-axis and the vector (x, y) in the
 // ##   Cartesian plane. It is defined as:
 // ##
 // ##     atan2(y, x) = arctan(y/x)
 // ##
 // ##   However, the standard arctangent function (arctan) has a range of [-pi/2, pi/2],
 // ##   which is not sufficient to represent all possible angles in the Cartesian plane.
 // ##   The atan2 function extends the range to [-pi, pi] by considering the signs of
 // ##   both x and y.
 // ##
 // ##   The mathematical relationships governing atan2 are as follows:
 // ##
 // ##     atan2(y, x) = arctan(y/x)                     (for x > 0)
 // ##     atan2(y, x) = arctan(y/x) + pi                (for x < 0 and y >= 0)
 // ##     atan2(y, x) = arctan(y/x) - pi                (for x < 0 and y < 0)
 // ##
 // ##   Additionally, the atan2 function handles special cases where either x or y is zero,
 // ##   positive infinity, or negative infinity, as described in the special cases section.
 // ##
 // ##   The implementation of cp_fatan2 follows these mathematical relationships and special
 // ##   cases to compute the correct angle for any given (x, y) coordinate pair.
 // ##
 // ##   Special cases:
 // ##     - If y is +0 and x is negative, the result is +pi
 // ##     - If y is -0 and x is negative, the result is -pi
 // ##     - If y is +0 and x is positive, the result is +0
 // ##     - If y is -0 and x is positive, the result is -0
 // ##     - If y is negative and x is 0, the result is -pi or pi/2 (depending on sign of y)
 // ##     - If y is positive and x is 0, the result is pi or pi/2 (depending on sign of y)
 // ##     - If either y or x is NaN, the result is NaN
 // ##     - If y is +0 and x is -0, the result is +pi
 // ##     - If y is -0 and x is -0, the result is -pi
 // ##     - If y is +0 and x is +0, the result is +0
 // ##     - If y is -0 and x is +0, the result is -0
 // ##     - If y is +inf and x is +inf, the result is pi/4
 // ##     - If y is -inf and x is +inf, the result is -pi/4
 // ##     - If y is +inf and x is -inf, the result is 3pi/4
 // ##     - If y is -inf and x is -inf, the result is -3pi/4
 // ##     - If y is finite and positive, and x is -inf, the result is +pi
 // ##     - If y is finite and negative, and x is -inf, the result is -pi
 // ##     - If y is finite and positive, and x is +inf, the result is +0
 // ##     - If y is finite and negative, and x is +inf, the result is -0
 // ##     - If x is finite and y is +inf, the result is pi/2
 // ##     - If x is finite and y is -inf, the result is -pi/2
 // ##
 // ##   General cases:
 // ##     - If x is negative and y is non-negative, the result is pi + atan(y/x)
 // ##     - If x is negative and y is negative, the result is -pi + atan(y/x)
 // ##     - Otherwise, the result is atan(y/x)

 qword _cp_fatan2(qword y, qword x)
 {
   const qword f_one = cp_filone();
   const qword f_zero = cp_filzero();
   const qword f_pi = cp_flpi();
   const qword f_npi = cp_flnpi();
   const qword f_inf = cp_filinf();
   const qword f_ninf = cp_filninf();

   // ##                        
   // ## yox = y
   // ##       -
   // ##       x
   // ##                        
   //
   // This section computes the ratio yox = y / x, which is a key step in the atan2(y, x) computation.
   // The ratio yox is computed using a series of FMA (Fused Multiply-Add) instructions for improved
   // accuracy and performance.
   //
   // First, x_r0 computes the fractional part of x using the si_frest intrinsic.
   // x_r1 computes (1 - x_r0) * x using the si_fnms (Fused Negative Multiply-Subtract) intrinsic.
   // x_r is then computed by refining the value of (1 - x_r0) using an FMA instruction.
   //
   // Finally, yox is computed by dividing y by x_r using the si_fm (Floating-Point Multiplication) intrinsic.

   const qword x_r0 = si_frest(x);
   const qword x_r1 = si_fnms(x_r0, x, f_one);
   const qword x_r = si_fma(x_r1, x_r0, x_r0);
   const qword yox = si_fm(y, x_r);

   // ##                        
   // ## z   = +PI + cp_atan(yox) { (x < 0.0) && (y >= 0.0)
   // ##       -PI + cp_atan(yox) { (x < 0.0) && (y < 0.0)
   // ##       0.0 + cp_atan(yox) { otherwise
   // ##                        
   //
   // Based on the signs of x and y, this section computes the final result z by adding or subtracting
   // multiples of PI to the result of cp_atan(yox), which is the arctangent of yox computed using
   // the polynomial approximation.
   //
   // First, masks are created to identify the cases where x is negative (x_ltz_mask), y is negative (y_ltz_mask),
   // and both x and y are negative (xy_ltz_mask).
   //
   // zadd0 is computed by selecting between 0.0 and +PI based on x_ltz_mask, which identifies if x is negative.
   //
   // zadd is then computed by selecting between zadd0 and -PI based on xy_ltz_mask, which identifies if both
   // x and y are negative.
   //
   // The atan_yox value is computed by calling the _cp_fatan function, which computes the arctangent of yox
   // using the polynomial approximation.
   //
   // Finally, result is computed by adding zadd (which can be 0.0, +PI, or -PI) to atan_yox, effectively
   // adjusting the arctangent result based on the signs of x and y.

   const qword x_ltz_mask = si_fcgt(f_zero, x);
   const qword y_ltz_mask = si_fcgt(f_zero, y);
   const qword xy_ltz_mask = si_and(x_ltz_mask, y_ltz_mask);
   const qword zadd0 = si_selb(f_zero, f_pi, x_ltz_mask);
   const qword zadd = si_selb(zadd0, f_npi, xy_ltz_mask);
   const qword atan_yox = _cp_fatan(yox);
   const qword result = si_fa(zadd, atan_yox);

   // Improved special case handling
   // Special cases need to be handled carefully to ensure correct results when dealing with
   // corner cases or exceptional inputs. The CELL BE architecture provides efficient SIMD
   // operations and intrinsics for handling these special cases in a parallel manner.
   // The following special cases are handled in this implementation:

   // Create masks to identify if x or y are finite values (not NaN or infinity).
   // The si_andc intrinsic performs a bitwise AND with the complement, clearing the
   // exponent bits of x and y. The si_fcgt intrinsic performs a floating-point greater-than
   // comparison with +inf, effectively checking if the values are finite.
   const qword x_isfinite_mask = si_fcgt(f_inf, si_andc(x, spu_splats(0x7f800000)));
   const qword y_isfinite_mask = si_fcgt(f_inf, si_andc(y, spu_splats(0x7f800000)));

   // Create masks to identify if x or y are equal to positive or negative infinity.
   // The si_fceq intrinsic performs a floating-point equal-to comparison with +inf and -inf.
   const qword x_eqinf_mask = si_fceq(f_inf, x);
   const qword x_eqninf_mask = si_fceq(f_ninf, x);
   const qword y_eqinf_mask = si_fceq(f_inf, y);
   const qword y_eqninf_mask = si_fceq(f_ninf, y);

   // Create masks to identify if x or y are equal to zero (positive or negative).
   // The si_fceq intrinsic performs a floating-point equal-to comparison with +0.0.
   const qword x_eqz_mask = si_fceq(f_zero, x);
   const qword y_eqz_mask = si_fceq(f_zero, y);
   
   // Combine masks to identify cases where both x and y are positive or negative infinity.
   const qword xy_eqinf_mask = si_and(x_eqinf_mask, y_eqinf_mask);
   const qword xy_eqninf_mask = si_and(x_eqninf_mask, y_eqninf_mask);

   // Create masks for cases where one input is infinity and the other is not.
   const qword y_posinf_x_neginf_mask = si_and(y_eqinf_mask, x_eqninf_mask);
   const qword y_neginf_x_neginf_mask = si_and(y_eqninf_mask, x_eqninf_mask);
   const qword y_posinf_x_posinf_mask = si_and(y_eqinf_mask, x_eqinf_mask);
   const qword y_neginf_x_posinf_mask = si_and(y_eqninf_mask, x_eqinf_mask);

   // Create masks for cases where one input is finite and the other is negative infinity.
   const qword y_posfinite_x_neginf_mask = si_and(y_isfinite_mask, si_fceq(f_ninf, x));
   const qword y_negfinite_x_neginf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_ninf, x));
   const qword y_posfinite_x_posinf_mask = si_and(y_isfinite_mask, si_fceq(f_inf, x));
   const qword y_negfinite_x_posinf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_inf, x));

   // Create masks for cases where x is finite and y is positive or negative infinity.
   const qword x_isfinite_y_posinf_mask = si_and(x_isfinite_mask, y_eqinf_mask);
   const qword x_isfinite_y_neginf_mask = si_and(x_isfinite_mask, y_eqninf_mask);

   // Combine all the special case masks using bitwise OR operations.
   const qword special_case_mask = si_or(xy_eqinf_mask, xy_eqninf_mask);
   special_case_mask = si_or(special_case_mask, y_posinf_x_neginf_mask);
   special_case_mask = si_or(special_case_mask, y_neginf_x_neginf_mask);
   special_case_mask = si_or(special_case_mask, y_posinf_x_posinf_mask);
   special_case_mask = si_or(special_case_mask, y_neginf_x_posinf_mask);
   special_case_mask = si_or(special_case_mask, y_posfinite_x_neginf_mask);
   special_case_mask = si_or(special_case_mask, y_negfinite_x_neginf_mask);
   special_case_mask = si_or(special_case_mask, y_posfinite_x_posinf_mask);
   special_case_mask = si_or(special_case_mask, y_negfinite_x_posinf_mask);
   special_case_mask = si_or(special_case_mask, x_isfinite_y_posinf_mask);
   special_case_mask = si_or(special_case_mask, x_isfinite_y_neginf_mask);
   special_case_mask = si_or(special_case_mask, x_eqz_mask);
   special_case_mask = si_or(special_case_mask, y_eqz_mask);

   // Compute the result for special cases using SIMD select operations.
   const qword y_posinf_x_neginf_result = si_fm(si_ilhu(0x4049), f_one);  // 3*pi/4
   const qword y_neginf_x_neginf_result = si_fm(si_ilhu(0xc049), f_one);  // -3*pi/4
   const qword y_posinf_x_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/4
   const qword y_neginf_x_posinf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/4
   const qword y_posfinite_x_neginf_result = f_pi;
   const qword y_negfinite_x_neginf_result = f_npi;
   const qword y_posfinite_x_posinf_result = f_zero;
   const qword y_negfinite_x_posinf_result = si_xor(f_zero, f_msb);
   const qword x_isfinite_y_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/2
   const qword x_isfinite_y_neginf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/2
   const qword x_eqz_result = si_selb(f_pi, f_zero, si_fcgt(f_zero, y));
   const qword y_eqz_result = si_selb(f_zero, si_xor(f_zero, f_msb), si_fcgt(f_zero, x));

   // Select the appropriate result value based on the special case masks using si_selb.
   const qword special_result = si_selb(result, y_posinf_x_neginf_result, y_posinf_x_neginf_mask);
   special_result = si_selb(special_result, y_neginf_x_neginf_result, y_neginf_x_neginf_mask);
   special_result = si_selb(special_result, y_posinf_x_posinf_result, y_posinf_x_posinf_mask);
   special_result = si_selb(special_result, y_neginf_x_posinf_result, y_neginf_x_posinf_mask);
   special_result = si_selb(special_result, y_posfinite_x_neginf_result, y_posfinite_x_neginf_mask);
   special_result = si_selb(special_result, y_negfinite_x_neginf_result, y_negfinite_x_neginf_mask);
   special_result = si_selb(special_result, y_posfinite_x_posinf_result, y_posfinite_x_posinf_mask);
   special_result = si_selb(special_result, y_negfinite_x_posinf_result, y_negfinite_x_posinf_mask);
   special_result = si_selb(special_result, x_isfinite_y_posinf_result, x_isfinite_y_posinf_mask);
   special_result = si_selb(special_result, x_isfinite_y_neginf_result, x_isfinite_y_neginf_mask);
   special_result = si_selb(special_result, x_eqz_result, x_eqz_mask);
   special_result = si_selb(special_result, y_eqz_result, y_eqz_mask);

   return si_selb(result, special_result, special_case_mask);
 }

 vector float cp_fatan2(vector float arg0, vector float arg1)
 {
   const qword y = (qword)arg0;
   const qword x = (qword)arg1;
   const qword result = _cp_fatan2(y, x);

   return (vector float)result;
 }

 // cp_fatan2_scalar(arg0, arg1)
 //
 // Computes the atan2(y, x) function using _cp_fatan2 for single-precision float inputs.
 //
 // Input:
 //   arg0: The y-coordinate as a single-precision float.
 //   arg1: The x-coordinate as a single-precision float.
 //
 // Output:
 //   The angle (theta) in radians between the positive x-axis and the point (x, y),
 //   in the range [-pi, pi], as a single-precision float.
 //
 // This function is a scalar wrapper around _cp_fatan2, which computes atan2(y, x) using
 // SIMD operations. The steps involved are:
 // 1. Convert the input floats arg0 (y) and arg1 (x) to quadwords y and x using si_from_float.
 // 2. Call _cp_fatan2(y, x) to compute the atan2(y, x) value, storing the result in z.
 // 3. Convert the quadword z back to a float result using si_to_float.
 // 4. Return the float result.
 //
 // This function provides a convenient way to compute atan2(y, x) for single-precision float
 // inputs while leveraging the SIMD capabilities of the _cp_fatan2 function.

 float cp_fatan2_scalar(float arg0, float arg1)
 {
   const qword y = si_from_float(arg0);
   const qword x = si_from_float(arg1);
   const qword z = _cp_fatan2(y, x);
   const float result = si_to_float(z);

   return result;
 }

 #endif