Skip to content

Instantly share code, notes, and snippets.

@maikxchd
Last active May 2, 2024 05:14
Show Gist options
  • Save maikxchd/b2419c047de81c082bf0ab5683a9b934 to your computer and use it in GitHub Desktop.
Save maikxchd/b2419c047de81c082bf0ab5683a9b934 to your computer and use it in GitHub Desktop.
Implementation of Atan on the CELL SPU
#ifndef CBE_SPU_FATAN
#define CBE_SPU_FATAN
#include <stdint.h>
#include <spu_intrinsics.h>
/* CBE_SPU_FATAN */
// Implementation of atan for the CELL BE Synergistic Processing Unit
// (SPU) using SIMD and IPL features
// Helper functions to load constant values into quadwords
// These functions load pre-computed constant values into quadwords,
// which can be efficiently used in SIMD operations on the CELL SPE.
// The constant values are stored as arrays of 32-bit integers, and
// the functions use the si_lqa intrinsic to load these values into
// quadwords for use in SIMD computations.
// Global Floating-point constants (32 bit)
// Constant is loaded in each element of 32 bit floating-point vector from local store.
// Since the LS is so fast, loading these contants from SPU local memory is going to win
// over building the or storing them locally near the function.
//
// cp_flpio4() +PI/+4
// cp_flt3p8() tan( +3.0 * PI / +8.0 )
// cp_flnpio2() -PI/+2
// cp_flpio2() +PI/+2
// cp_flpt66() +0.66
// cp_flpi() +PI
// cp_flnpi() -PI
static const vector unsigned int _cp_f_pio4 = { 0x3f490fdb, 0x3f490fdb, 0x3f490fdb, 0x3f490fdb };
static const vector unsigned int _cp_f_t3p8 = { 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d };
static const vector unsigned int _cp_f_npio2 = { 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb };
static const vector unsigned int _cp_f_pio2 = { 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb };
static const vector unsigned int _cp_f_pt66 = { 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab };
static const vector unsigned int _cp_f_pi = { 0x40490fdb, 0x40490fdb, 0x40490fdb, 0x40490fdb };
static const vector unsigned int _cp_f_npi = { 0xc0490fdb, 0xc0490fdb, 0xc0490fdb, 0xc0490fdb };
static const vector unsigned int _cp_f_morebits = { 0x38800000, 0x38800000, 0x38800000, 0x38800000 };
static const vector unsigned int _cp_f_hmorebits = { 0x34000000, 0x34000000, 0x34000000, 0x34000000 };
// Helper functions to load constant values into quadwords
static inline qword cp_flpio4(void) { return si_lqa((intptr_t)&_cp_f_pio4); }
static inline qword cp_flt3p8(void) { return si_lqa((intptr_t)&_cp_f_t3p8); }
static inline qword cp_flnpio2(void) { return si_lqa((intptr_t)&_cp_f_npio2); }
static inline qword cp_flpio2(void) { return si_lqa((intptr_t)&_cp_f_pio2); }
static inline qword cp_flpt66(void) { return si_lqa((intptr_t)&_cp_f_pt66); }
static inline qword cp_flpi(void) { return si_lqa((intptr_t)&_cp_f_pi); }
static inline qword cp_flnpi(void) { return si_lqa((intptr_t)&_cp_f_npi); }
static inline qword cp_filzero(void) { return si_ilhu((int16_t)0x0000); }
static inline qword cp_filnzero(void) { return si_ilhu((int16_t)0x8000); }
static inline qword cp_filone(void) { return si_ilhu((int16_t)0x3f80); }
static inline qword cp_filtwo(void) { return si_ilhu((int16_t)0x4000); }
static inline qword cp_filinf(void) { return si_ilhu((int16_t)0x7f80); }
static inline qword cp_filninf(void) { return si_ilhu((int16_t)0xff80); }
static inline qword cp_filnan(void) { return si_ilhu((int16_t)0x7fc0); }
// Polynomial coefficients for cp_fatan approximation
static const vector unsigned int _cp_f_atan_p7 = { 0x3c08876a, 0x3c08876a, 0x3c08876a, 0x3c08876a };
static const vector unsigned int _cp_f_atan_p6 = { 0xbd954629, 0xbd954629, 0xbd954629, 0xbd954629 };
static const vector unsigned int _cp_f_atan_p5 = { 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1 };
static const vector unsigned int _cp_f_atan_p4 = { 0xbf49eee6, 0xbf49eee6, 0xbf49eee6, 0xbf49eee6 };
static const vector unsigned int _cp_f_atan_p3 = { 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5 };
static const vector unsigned int _cp_f_atan_p2 = { 0xbf62365c, 0xbf62365c, 0xbf62365c, 0xbf62365c };
static const vector unsigned int _cp_f_atan_p1 = { 0x3f490965, 0x3f490965, 0x3f490965, 0x3f490965 };
static const vector unsigned int _cp_f_atan_p0 = { 0xbf2697e0, 0xbf2697e0, 0xbf2697e0, 0xbf2697e0 };
// Higher-degree polynomial coefficients for cp_fatan approximation
// These are pre-computed constant vectors representing the coefficients
// of the polynomials used to approximate the atan function.
static const vector unsigned int _cp_f_atan_q7 = { 0x3c0897d0, 0x3c0897d0, 0x3c0897d0, 0x3c0897d0 };
static const vector unsigned int _cp_f_atan_q6 = { 0xbd890e31, 0xbd890e31, 0xbd890e31, 0xbd890e31 };
static const vector unsigned int _cp_f_atan_q5 = { 0x3f6c4616, 0x3f6c4616, 0x3f6c4616, 0x3f6c4616 };
static const vector unsigned int _cp_f_atan_q4 = { 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2 };
static const vector unsigned int _cp_f_atan_q3 = { 0x3eb6679b, 0x3eb6679b, 0x3eb6679b, 0x3eb6679b };
static const vector unsigned int _cp_f_atan_q2 = { 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9 };
static const vector unsigned int _cp_f_atan_q1 = { 0x3f7df927, 0x3f7df927, 0x3f7df927, 0x3f7df927 };
static const vector unsigned int _cp_f_atan_q0 = { 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 };
// _cp_fatan(x)
//
// Computes the arctangent of the input value x using a polynomial approximation.
//
// Input:
// x: A quadword representing a single-precision floating-point value or a vector
// of four single-precision floating-point values.
//
// Output:
// A quadword containing the arctangent of x (or a vector of arctangents if x was a vector).
//
// This function utilizes SIMD instructions and the ILP (Instruction-Level Parallelism)
// capabilities of the CELL SPE to compute the arctangent of four single-precision
// floating-point values simultaneously. The polynomial approximation is implemented
// using a series of SIMD operations, taking advantage of the CELL SPE's vector operations
// and the Fused Multiply-Add (FMA) instruction to achieve efficient parallel computation.
//
// The FMA instruction performs a single, combined multiply-add operation, computing
// (a * b) + c with a single instruction. This can improve performance compared to
// separate multiply and add instructions, as it reduces the number of instructions required
// and can utilize specialized hardware for faster execution.
//
// In this implementation, the FMA instruction is used extensively in the polynomial
// evaluation and range reduction steps, allowing for efficient computation of expressions
// like (a * x^2) + (b * x) + c in a single instruction.
static inline qword _cp_fatan(const qword x)
{
const qword f_one = cp_filone();
const qword f_inf = cp_filinf();
const qword f_ninf = cp_filninf();
const qword f_msb = cp_filnzero();
const qword f_zero = cp_filzero();
const qword f_pt66 = cp_flpt66();
const qword f_pio2 = cp_flpio2();
const qword f_npio2 = cp_flnpio2();
const qword f_pio4 = cp_flpio4();
const qword f_t3p8 = cp_flt3p8();
const qword f_atan_p0 = si_lqa((intptr_t)&_cp_f_atan_p0);
const qword f_atan_p1 = si_lqa((intptr_t)&_cp_f_atan_p1);
const qword f_atan_p2 = si_lqa((intptr_t)&_cp_f_atan_p2);
const qword f_atan_p3 = si_lqa((intptr_t)&_cp_f_atan_p3);
const qword f_atan_p4 = si_lqa((intptr_t)&_cp_f_atan_p4);
const qword f_atan_q0 = si_lqa((intptr_t)&_cp_f_atan_q0);
const qword f_atan_q1 = si_lqa((intptr_t)&_cp_f_atan_q1);
const qword f_atan_q2 = si_lqa((intptr_t)&_cp_f_atan_q2);
const qword f_atan_q3 = si_lqa((intptr_t)&_cp_f_atan_q3);
const qword f_atan_q4 = si_lqa((intptr_t)&_cp_f_atan_q4);
const qword f_morebits = si_lqa((intptr_t)&_cp_f_morebits);
const qword f_hmorebits = si_lqa((intptr_t)&_cp_f_hmorebits);
// ##
// ## pos_x = -x { x < 0
// ## x { otherwise
// ##
// This section computes the absolute value of the input x, which is needed for the range
// reduction step. The sign of the input is handled separately.
//
// neg_x is computed by XORing x with f_msb, which flips the sign bit of x, effectively
// negating x. sign_mask is a mask that identifies if x is negative or not, using the si_fcgt
// intrinsic to perform a floating-point greater-than comparison with 0.0.
//
// pos_x is then computed by selecting between x and neg_x based on the sign_mask, using
// the si_selb intrinsic. If x is negative, neg_x (-x) is selected, otherwise x is selected.
// This operation effectively computes the absolute value of x.
const qword neg_x = si_xor(x, f_msb);
const qword sign_mask = si_fcgt(f_zero, x);
const qword pos_x = si_selb(x, neg_x, sign_mask);
// Range reduction
// ##
// ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) )
// ## range1_mask = ( pos_x <= 0.66 )
// ## range2_mask = !( range0_mask || range1_mask )
// ##
//
// This section performs range reduction, which maps the input values into a smaller range
// where the arctangent function can be more accurately approximated using a polynomial.
//
// range0_mask identifies cases where pos_x is greater than tan(3π/8), which is a constant
// value f_t3p8. The si_fcgt intrinsic performs a floating-point greater-than comparison.
//
// range1_mask identifies cases where pos_x is less than or equal to 0.66, which is a constant
// value f_pt66. It is computed by combining two masks using si_or: range1_gt_mask (pos_x < 0.66)
// and range1_eq_mask (pos_x == 0.66).
//
// range2_mask identifies the remaining cases that are not covered by range0_mask or range1_mask.
// It is computed by performing a bitwise NOR operation (si_nor) on range0_mask and range1_mask.
//
// These range masks are used to select different computations and approximations for the
// arctangent function, based on the range in which the input value falls.
const qword range0_mask = si_fcgt(pos_x, f_t3p8);
const qword range1_gt_mask = si_fcgt(f_pt66, pos_x);
const qword range1_eq_mask = si_fceq(f_pt66, pos_x);
const qword range1_mask = si_or(range1_gt_mask, range1_eq_mask);
const qword range2_mask = si_nor(range0_mask, range1_mask);
// Range reduction using FMA
//
// range0_x = -1.0
// -----
// pos_x
//
// The range reduction step for the range0 case involves dividing -1.0 by pos_x.
// This is implemented using a series of FMA (Fused Multiply-Add) instructions, which perform
// the operation (a * b) + c in a single instruction for improved performance.
//
// The steps to compute range0_x are:
// range0_x0 = pos_x - (trunc(pos_x) * pos_x) // Compute the fractional part of pos_x
// range0_x1 = trunc(pos_x) // Compute the integer part of pos_x
// range0_x2 = -range0_x1 * pos_x + 1.0 // Compute -range0_x1 * pos_x and add 1.0
// range0_x3 = range0_x2 * range0_x1 + range0_x1 // Final result: range0_x3 = -1.0 / pos_x
//
// The FMA instructions are used to improve performance by reducing the number of instructions
// and utilizing specialized hardware for faster execution.
const qword range0_x0 = si_frest(pos_x);
const qword range0_x1 = si_fi(pos_x, range0_x0);
const qword range0_x2 = si_fnms(range0_x1, pos_x, f_one);
const qword range0_x3 = si_fma(range0_x2, range0_x1, range0_x1);
const qword range0_x = si_xor(range0_x3, f_msb);
const qword range0_y = f_pio2;
// ##
// ## range1_x = pos_x
// ## range1_y = 0.0
// ##
//
// For the range1 case, where the input value pos_x is less than or equal to 0.66,
// the range-specific values are simply set to:
//
// range1_x = pos_x
// range1_y = 0.0
//
// This means that the input value itself (pos_x) is used for the arctangent approximation
// in this range, and the corresponding y-value is set to 0.0.
const qword range1_x = pos_x;
const qword range1_y = f_zero;
// ##
// ## range2_x = (pos_x-1.0)
// ## -----------
// ## (pos_x+1.0)
// ##
// ## range2_y = PI
// ## ---
// ## 4.0
// ##
//
// For the range2 case, the range reduction involves computing (pos_x - 1.0) / (pos_x + 1.0).
// This is done using a series of steps:
// range2_x0num = pos_x - 1.0 // Compute the numerator
// range2_x0den = pos_x + 1.0 // Compute the denominator
// range2_x0 = range2_x0den - (trunc(range2_x0den) * range2_x0den) // Compute the fractional part of the denominator
// range2_x1 = -range2_x0 * range2_x0den + 1.0 // Compute the reciprocal of the denominator
// range2_x2 = range2_x1 * range2_x0 // Refine the reciprocal
// range2_x = range2_x0num * range2_x2 // Final result: range2_x = (pos_x - 1.0) / (pos_x + 1.0)
//
// The range2_y value is set to PI/4.0, which is used in the final arctangent computation.
const qword range2_y = f_pio4;
const qword range2_x0num = si_fs(pos_x, f_one);
const qword range2_x0den = si_fa(pos_x, f_one);
const qword range2_x0 = si_frest(range2_x0den);
const qword range2_x1 = si_fnms(range2_x0, range2_x0den, f_one);
const qword range2_x2 = si_fma(range2_x1, range2_x0, range2_x0);
const qword range2_x = si_fm(range2_x0num, range2_x2);
// ##
// ## range_x = range0_x { range0_mask
// ## range1_x { range1_mask
// ## range2_x { range2_mask
// ##
// ## range_y = range0_y { range0_mask
// ## range1_y { range1_mask
// ## range2_y { range2_mask
// ##
//
// Based on the range masks computed earlier (range0_mask, range1_mask, range2_mask),
// this section selects the appropriate range-specific values for x (range_x) and y (range_y).
//
// The range-specific values (range0_x, range1_x, range2_x, range0_y, range1_y, range2_y)
// are precomputed constants that are used in the arctangent approximation for each range.
//
// The si_selb intrinsic is used to select the appropriate value based on the range masks.
// It performs a conditional selection operation, selecting the first value if the corresponding
// mask bit is 0, and the second value if the mask bit is 1.
//
// For range_x:
// 1. range_x0 is computed by selecting between range2_x and range0_x based on range0_mask.
// 2. range_x is computed by selecting between range_x0 and range1_x based on range1_mask.
//
// For range_y:
// 1. range_y0 is computed by selecting between range2_y and range0_y based on range0_mask.
// 2. range_y is computed by selecting between range_y0 and range1_y based on range1_mask.
//
// These range-specific values are used in the subsequent steps of the arctangent approximation.
const qword range_x0 = si_selb(range2_x, range0_x, range0_mask);
const qword range_x = si_selb(range_x0, range1_x, range1_mask);
const qword range_y0 = si_selb(range2_y, range0_y, range0_mask);
const qword range_y = si_selb(range_y0, range1_y, range1_mask);
// Polynomial approximation using FMA
//
// zdiv = P + P xp2 + P xp2^2 + P xp2^3 + P xp2^4
// 0 1 2 3 4
// ------------------------------------------
// Q + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5
// 0 1 2 3 4
//
// The polynomial approximation is implemented using Estrin's method, evaluating
// the numerator and denominator polynomials separately using FMA instructions,
// and then dividing the numerator by the denominator to obtain the final result.
//
// The numerator is computed as:
// znum0 = f_atan_p0
// znum1 = (znum0 * xp2) + f_atan_p1
// znum2 = (znum1 * xp2) + f_atan_p2
// znum3 = (znum2 * xp2) + f_atan_p3
// znum = (znum3 * xp2) + f_atan_p4
//
// The denominator is computed as:
// zden0 = xp2 + f_atan_q0
// zden1 = (zden0 * xp2) + f_atan_q1
// zden2 = (zden1 * xp2) + f_atan_q2
// zden3 = (zden2 * xp2) + f_atan_q3
// zden = (zden3 * xp2) + f_atan_q4 + xp2
//
// The final result, zdiv, is computed by dividing znum by zden using
// additional FMA instructions for improved accuracy.
const qword xp2 = si_fm(range_x, range_x);
const qword znum0 = f_atan_p0;
const qword znum1 = si_fma(znum0, xp2, f_atan_p1); // FMA: (znum0 * xp2) + f_atan_p1
const qword znum2 = si_fma(znum1, xp2, f_atan_p2); // FMA: (znum1 * xp2) + f_atan_p2
const qword znum3 = si_fma(znum2, xp2, f_atan_p3); // FMA: (znum2 * xp2) + f_atan_p3
const qword znum = si_fma(znum3, xp2, f_atan_p4); // FMA: (znum3 * xp2) + f_atan_p4
const qword zden0 = si_fa(xp2, f_atan_q0);
const qword zden1 = si_fma(zden0, xp2, f_atan_q1); // FMA: (zden0 * xp2) + f_atan_q1
const qword zden2 = si_fma(zden1, xp2, f_atan_q2); // FMA: (zden1 * xp2) + f_atan_q2
const qword zden3 = si_fma(zden2, xp2, f_atan_q3); // FMA: (zden2 * xp2) + f_atan_q3
const qword zden = si_fma(zden3, xp2, f_atan_q4); // FMA: (zden3 * xp2) + f_atan_q4
const qword zden_r0 = si_frest(zden);
const qword zden_r1 = si_fnms(zden_r0, zden, f_one);
const qword zden_r = si_fma(zden_r1, zden_r0, zden_r0);
const qword zdiv = si_fm(znum, zden_r);
const qword z0 = si_fm(xp2, zdiv);
const qword z1 = si_fma(range_x, z0, range_x);
// ##
// ## zadd = z1 + 0.5 * MOREBITS { range2_mask
// ## z1 + MOREBITS { range1_mask
// ## z1 { otherwise
// ##
// ## yaddz = range_y + zadd
// ##
// ## pos_yaddz = yaddz { yaddz >= 0
// ## -yaddz { yaddz < 0
// ##
const qword zadd0 = si_selb(f_zero, f_hmorebits, range2_mask);
const qword zadd1 = si_selb(zadd0, f_morebits, range1_mask);
const qword zadd = si_fa(z1, zadd1);
const qword yaddz = si_fa(range_y, zadd);
const qword neg_yaddz = si_xor(yaddz, f_msb);
const qword pos_yaddz = si_selb(yaddz, neg_yaddz, sign_mask);
// Handling special cases
//
// result_y0 = 0.0 { x == 0.0
// pos_yaddz { otherwise
//
// In this step, the function checks if the input value x is equal to 0.0.
// If it is, the result is set to 0.0. Otherwise, the result is set to pos_yaddz,
// which is the computed arctangent value after range reduction and polynomial
// approximation.
const qword x_eqz_mask = si_fceq(f_zero, x);
const qword result_y0 = si_selb(pos_yaddz, x, x_eqz_mask);
// result_y2 = +PI {
// --- { x == INF
// 2.0 {
//
// -PI {
// --- { x == -INF
// 2.0 {
//
// result_y0 { otherwise
//
// This section handles the cases where the input value x is positive or negative
// infinity. If x is positive infinity, the result is set to pi/2.0. If x is negative
// infinity, the result is set to -pi/2.0. Otherwise, the result is set to result_y0,
// which is the computed arctangent value from the previous step.
const qword x_eqinf_mask = si_fceq(f_inf, x);
const qword x_eqninf_mask = si_fceq(f_ninf, x);
const qword result_y1 = si_selb(result_y0, f_pio2, x_eqinf_mask);
const qword result = si_selb(result_y1, f_npio2, x_eqninf_mask);
return result;
}
// cp_fatan(x)
//
// Computes the arctangent of the input vector float x using _cp_fatan.
//
// Input:
// x: A vector float representing four single-precision floating-point values.
//
// Output:
// A vector float containing the arctangent of each element in x.
//
// This function acts as a wrapper around _cp_fatan, converting the input
// vector float to a quadword, computing the arctangent using _cp_fatan,
// and then converting the result back to a vector float.
static inline vector float cp_fatan(const vector float x)
{
return (vector float)(_cp_fatan((qword)x));
}
// cp_fatan_scalar(x)
//
// Computes the arctangent of the input single-precision float x using _cp_fatan.
//
// Input:
// x: A single-precision floating-point value.
//
// Output:
// The arctangent of x as a single-precision floating-point value.
//
// This function acts as a wrapper around _cp_fatan, converting the input
// float to a quadword, computing the arctangent using _cp_fatan, and then
// converting the result back to a float.
//
// The steps involved are:
// 1. Convert the input float x to a quadword vx using si_from_float.
// 2. Call _cp_fatan(vx) to compute the arctangent of vx, storing the result in vresult.
// 3. Convert the quadword vresult back to a float result using si_to_float.
// 4. Return the float result.
//
// This function provides a convenient way to compute the arctangent of a single float value,
// while still leveraging the SIMD capabilities of the _cp_fatan function.
static inline float cp_fatan_scalar(const float x)
{
const qword vx = si_from_float(x);
const qword vresult = _cp_fatan(vx);
const float result = si_to_float(vresult);
return result;
}
static inline float cp_fatan_scalar(const float x)
{
const qword vx = si_from_float(x);
const qword vresult = _cp_fatan(vx);
const float result = si_to_float(vresult);
return result;
}
// ## cp_fatan2(y, x)
// ##
// ## Function to compute the angle (in radians) between the positive x-axis and the
// ## point (x, y) in the Cartesian plane, using the atan2 function.
// ##
// ## Input:
// ## y: float or vector float representing the y-coordinate
// ## x: float or vector float representing the x-coordinate
// ##
// ## Output:
// ## The angle (theta) in radians between the positive x-axis and the point (x, y)
// ## in the range [-pi, pi].
// ##
// ## Mathematical Description:
// ## The atan2 function is a variation of the arctangent function, which computes
// ## the angle (in radians) between the positive x-axis and the vector (x, y) in the
// ## Cartesian plane. It is defined as:
// ##
// ## atan2(y, x) = arctan(y/x)
// ##
// ## However, the standard arctangent function (arctan) has a range of [-pi/2, pi/2],
// ## which is not sufficient to represent all possible angles in the Cartesian plane.
// ## The atan2 function extends the range to [-pi, pi] by considering the signs of
// ## both x and y.
// ##
// ## The mathematical relationships governing atan2 are as follows:
// ##
// ## atan2(y, x) = arctan(y/x) (for x > 0)
// ## atan2(y, x) = arctan(y/x) + pi (for x < 0 and y >= 0)
// ## atan2(y, x) = arctan(y/x) - pi (for x < 0 and y < 0)
// ##
// ## Additionally, the atan2 function handles special cases where either x or y is zero,
// ## positive infinity, or negative infinity, as described in the special cases section.
// ##
// ## The implementation of cp_fatan2 follows these mathematical relationships and special
// ## cases to compute the correct angle for any given (x, y) coordinate pair.
// ##
// ## Special cases:
// ## - If y is +0 and x is negative, the result is +pi
// ## - If y is -0 and x is negative, the result is -pi
// ## - If y is +0 and x is positive, the result is +0
// ## - If y is -0 and x is positive, the result is -0
// ## - If y is negative and x is 0, the result is -pi or pi/2 (depending on sign of y)
// ## - If y is positive and x is 0, the result is pi or pi/2 (depending on sign of y)
// ## - If either y or x is NaN, the result is NaN
// ## - If y is +0 and x is -0, the result is +pi
// ## - If y is -0 and x is -0, the result is -pi
// ## - If y is +0 and x is +0, the result is +0
// ## - If y is -0 and x is +0, the result is -0
// ## - If y is +inf and x is +inf, the result is pi/4
// ## - If y is -inf and x is +inf, the result is -pi/4
// ## - If y is +inf and x is -inf, the result is 3pi/4
// ## - If y is -inf and x is -inf, the result is -3pi/4
// ## - If y is finite and positive, and x is -inf, the result is +pi
// ## - If y is finite and negative, and x is -inf, the result is -pi
// ## - If y is finite and positive, and x is +inf, the result is +0
// ## - If y is finite and negative, and x is +inf, the result is -0
// ## - If x is finite and y is +inf, the result is pi/2
// ## - If x is finite and y is -inf, the result is -pi/2
// ##
// ## General cases:
// ## - If x is negative and y is non-negative, the result is pi + atan(y/x)
// ## - If x is negative and y is negative, the result is -pi + atan(y/x)
// ## - Otherwise, the result is atan(y/x)
qword _cp_fatan2(qword y, qword x)
{
const qword f_one = cp_filone();
const qword f_zero = cp_filzero();
const qword f_pi = cp_flpi();
const qword f_npi = cp_flnpi();
const qword f_inf = cp_filinf();
const qword f_ninf = cp_filninf();
// ##
// ## yox = y
// ## -
// ## x
// ##
//
// This section computes the ratio yox = y / x, which is a key step in the atan2(y, x) computation.
// The ratio yox is computed using a series of FMA (Fused Multiply-Add) instructions for improved
// accuracy and performance.
//
// First, x_r0 computes the fractional part of x using the si_frest intrinsic.
// x_r1 computes (1 - x_r0) * x using the si_fnms (Fused Negative Multiply-Subtract) intrinsic.
// x_r is then computed by refining the value of (1 - x_r0) using an FMA instruction.
//
// Finally, yox is computed by dividing y by x_r using the si_fm (Floating-Point Multiplication) intrinsic.
const qword x_r0 = si_frest(x);
const qword x_r1 = si_fnms(x_r0, x, f_one);
const qword x_r = si_fma(x_r1, x_r0, x_r0);
const qword yox = si_fm(y, x_r);
// ##
// ## z = +PI + cp_atan(yox) { (x < 0.0) && (y >= 0.0)
// ## -PI + cp_atan(yox) { (x < 0.0) && (y < 0.0)
// ## 0.0 + cp_atan(yox) { otherwise
// ##
//
// Based on the signs of x and y, this section computes the final result z by adding or subtracting
// multiples of PI to the result of cp_atan(yox), which is the arctangent of yox computed using
// the polynomial approximation.
//
// First, masks are created to identify the cases where x is negative (x_ltz_mask), y is negative (y_ltz_mask),
// and both x and y are negative (xy_ltz_mask).
//
// zadd0 is computed by selecting between 0.0 and +PI based on x_ltz_mask, which identifies if x is negative.
//
// zadd is then computed by selecting between zadd0 and -PI based on xy_ltz_mask, which identifies if both
// x and y are negative.
//
// The atan_yox value is computed by calling the _cp_fatan function, which computes the arctangent of yox
// using the polynomial approximation.
//
// Finally, result is computed by adding zadd (which can be 0.0, +PI, or -PI) to atan_yox, effectively
// adjusting the arctangent result based on the signs of x and y.
const qword x_ltz_mask = si_fcgt(f_zero, x);
const qword y_ltz_mask = si_fcgt(f_zero, y);
const qword xy_ltz_mask = si_and(x_ltz_mask, y_ltz_mask);
const qword zadd0 = si_selb(f_zero, f_pi, x_ltz_mask);
const qword zadd = si_selb(zadd0, f_npi, xy_ltz_mask);
const qword atan_yox = _cp_fatan(yox);
const qword result = si_fa(zadd, atan_yox);
// Improved special case handling
// Special cases need to be handled carefully to ensure correct results when dealing with
// corner cases or exceptional inputs. The CELL BE architecture provides efficient SIMD
// operations and intrinsics for handling these special cases in a parallel manner.
// The following special cases are handled in this implementation:
// Create masks to identify if x or y are finite values (not NaN or infinity).
// The si_andc intrinsic performs a bitwise AND with the complement, clearing the
// exponent bits of x and y. The si_fcgt intrinsic performs a floating-point greater-than
// comparison with +inf, effectively checking if the values are finite.
const qword x_isfinite_mask = si_fcgt(f_inf, si_andc(x, spu_splats(0x7f800000)));
const qword y_isfinite_mask = si_fcgt(f_inf, si_andc(y, spu_splats(0x7f800000)));
// Create masks to identify if x or y are equal to positive or negative infinity.
// The si_fceq intrinsic performs a floating-point equal-to comparison with +inf and -inf.
const qword x_eqinf_mask = si_fceq(f_inf, x);
const qword x_eqninf_mask = si_fceq(f_ninf, x);
const qword y_eqinf_mask = si_fceq(f_inf, y);
const qword y_eqninf_mask = si_fceq(f_ninf, y);
// Create masks to identify if x or y are equal to zero (positive or negative).
// The si_fceq intrinsic performs a floating-point equal-to comparison with +0.0.
const qword x_eqz_mask = si_fceq(f_zero, x);
const qword y_eqz_mask = si_fceq(f_zero, y);
// Combine masks to identify cases where both x and y are positive or negative infinity.
const qword xy_eqinf_mask = si_and(x_eqinf_mask, y_eqinf_mask);
const qword xy_eqninf_mask = si_and(x_eqninf_mask, y_eqninf_mask);
// Create masks for cases where one input is infinity and the other is not.
const qword y_posinf_x_neginf_mask = si_and(y_eqinf_mask, x_eqninf_mask);
const qword y_neginf_x_neginf_mask = si_and(y_eqninf_mask, x_eqninf_mask);
const qword y_posinf_x_posinf_mask = si_and(y_eqinf_mask, x_eqinf_mask);
const qword y_neginf_x_posinf_mask = si_and(y_eqninf_mask, x_eqinf_mask);
// Create masks for cases where one input is finite and the other is negative infinity.
const qword y_posfinite_x_neginf_mask = si_and(y_isfinite_mask, si_fceq(f_ninf, x));
const qword y_negfinite_x_neginf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_ninf, x));
const qword y_posfinite_x_posinf_mask = si_and(y_isfinite_mask, si_fceq(f_inf, x));
const qword y_negfinite_x_posinf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_inf, x));
// Create masks for cases where x is finite and y is positive or negative infinity.
const qword x_isfinite_y_posinf_mask = si_and(x_isfinite_mask, y_eqinf_mask);
const qword x_isfinite_y_neginf_mask = si_and(x_isfinite_mask, y_eqninf_mask);
// Combine all the special case masks using bitwise OR operations.
const qword special_case_mask = si_or(xy_eqinf_mask, xy_eqninf_mask);
special_case_mask = si_or(special_case_mask, y_posinf_x_neginf_mask);
special_case_mask = si_or(special_case_mask, y_neginf_x_neginf_mask);
special_case_mask = si_or(special_case_mask, y_posinf_x_posinf_mask);
special_case_mask = si_or(special_case_mask, y_neginf_x_posinf_mask);
special_case_mask = si_or(special_case_mask, y_posfinite_x_neginf_mask);
special_case_mask = si_or(special_case_mask, y_negfinite_x_neginf_mask);
special_case_mask = si_or(special_case_mask, y_posfinite_x_posinf_mask);
special_case_mask = si_or(special_case_mask, y_negfinite_x_posinf_mask);
special_case_mask = si_or(special_case_mask, x_isfinite_y_posinf_mask);
special_case_mask = si_or(special_case_mask, x_isfinite_y_neginf_mask);
special_case_mask = si_or(special_case_mask, x_eqz_mask);
special_case_mask = si_or(special_case_mask, y_eqz_mask);
// Compute the result for special cases using SIMD select operations.
const qword y_posinf_x_neginf_result = si_fm(si_ilhu(0x4049), f_one); // 3*pi/4
const qword y_neginf_x_neginf_result = si_fm(si_ilhu(0xc049), f_one); // -3*pi/4
const qword y_posinf_x_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/4
const qword y_neginf_x_posinf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/4
const qword y_posfinite_x_neginf_result = f_pi;
const qword y_negfinite_x_neginf_result = f_npi;
const qword y_posfinite_x_posinf_result = f_zero;
const qword y_negfinite_x_posinf_result = si_xor(f_zero, f_msb);
const qword x_isfinite_y_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/2
const qword x_isfinite_y_neginf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/2
const qword x_eqz_result = si_selb(f_pi, f_zero, si_fcgt(f_zero, y));
const qword y_eqz_result = si_selb(f_zero, si_xor(f_zero, f_msb), si_fcgt(f_zero, x));
// Select the appropriate result value based on the special case masks using si_selb.
const qword special_result = si_selb(result, y_posinf_x_neginf_result, y_posinf_x_neginf_mask);
special_result = si_selb(special_result, y_neginf_x_neginf_result, y_neginf_x_neginf_mask);
special_result = si_selb(special_result, y_posinf_x_posinf_result, y_posinf_x_posinf_mask);
special_result = si_selb(special_result, y_neginf_x_posinf_result, y_neginf_x_posinf_mask);
special_result = si_selb(special_result, y_posfinite_x_neginf_result, y_posfinite_x_neginf_mask);
special_result = si_selb(special_result, y_negfinite_x_neginf_result, y_negfinite_x_neginf_mask);
special_result = si_selb(special_result, y_posfinite_x_posinf_result, y_posfinite_x_posinf_mask);
special_result = si_selb(special_result, y_negfinite_x_posinf_result, y_negfinite_x_posinf_mask);
special_result = si_selb(special_result, x_isfinite_y_posinf_result, x_isfinite_y_posinf_mask);
special_result = si_selb(special_result, x_isfinite_y_neginf_result, x_isfinite_y_neginf_mask);
special_result = si_selb(special_result, x_eqz_result, x_eqz_mask);
special_result = si_selb(special_result, y_eqz_result, y_eqz_mask);
return si_selb(result, special_result, special_case_mask);
}
vector float cp_fatan2(vector float arg0, vector float arg1)
{
const qword y = (qword)arg0;
const qword x = (qword)arg1;
const qword result = _cp_fatan2(y, x);
return (vector float)result;
}
// cp_fatan2_scalar(arg0, arg1)
//
// Computes the atan2(y, x) function using _cp_fatan2 for single-precision float inputs.
//
// Input:
// arg0: The y-coordinate as a single-precision float.
// arg1: The x-coordinate as a single-precision float.
//
// Output:
// The angle (theta) in radians between the positive x-axis and the point (x, y),
// in the range [-pi, pi], as a single-precision float.
//
// This function is a scalar wrapper around _cp_fatan2, which computes atan2(y, x) using
// SIMD operations. The steps involved are:
// 1. Convert the input floats arg0 (y) and arg1 (x) to quadwords y and x using si_from_float.
// 2. Call _cp_fatan2(y, x) to compute the atan2(y, x) value, storing the result in z.
// 3. Convert the quadword z back to a float result using si_to_float.
// 4. Return the float result.
//
// This function provides a convenient way to compute atan2(y, x) for single-precision float
// inputs while leveraging the SIMD capabilities of the _cp_fatan2 function.
float cp_fatan2_scalar(float arg0, float arg1)
{
const qword y = si_from_float(arg0);
const qword x = si_from_float(arg1);
const qword z = _cp_fatan2(y, x);
const float result = si_to_float(z);
return result;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment