Last active
May 2, 2024 05:14
-
-
Save maikxchd/b2419c047de81c082bf0ab5683a9b934 to your computer and use it in GitHub Desktop.
Implementation of Atan on the CELL SPU
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef CBE_SPU_FATAN | |
#define CBE_SPU_FATAN | |
#include <stdint.h> | |
#include <spu_intrinsics.h> | |
/* CBE_SPU_FATAN */ | |
// Implementation of atan for the CELL BE Synergistic Processing Unit | |
// (SPU) using SIMD and IPL features | |
// Helper functions to load constant values into quadwords | |
// These functions load pre-computed constant values into quadwords, | |
// which can be efficiently used in SIMD operations on the CELL SPE. | |
// The constant values are stored as arrays of 32-bit integers, and | |
// the functions use the si_lqa intrinsic to load these values into | |
// quadwords for use in SIMD computations. | |
// Global Floating-point constants (32 bit) | |
// Constant is loaded in each element of 32 bit floating-point vector from local store. | |
// Since the LS is so fast, loading these contants from SPU local memory is going to win | |
// over building the or storing them locally near the function. | |
// | |
// cp_flpio4() +PI/+4 | |
// cp_flt3p8() tan( +3.0 * PI / +8.0 ) | |
// cp_flnpio2() -PI/+2 | |
// cp_flpio2() +PI/+2 | |
// cp_flpt66() +0.66 | |
// cp_flpi() +PI | |
// cp_flnpi() -PI | |
static const vector unsigned int _cp_f_pio4 = { 0x3f490fdb, 0x3f490fdb, 0x3f490fdb, 0x3f490fdb }; | |
static const vector unsigned int _cp_f_t3p8 = { 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d }; | |
static const vector unsigned int _cp_f_npio2 = { 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb }; | |
static const vector unsigned int _cp_f_pio2 = { 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb }; | |
static const vector unsigned int _cp_f_pt66 = { 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab }; | |
static const vector unsigned int _cp_f_pi = { 0x40490fdb, 0x40490fdb, 0x40490fdb, 0x40490fdb }; | |
static const vector unsigned int _cp_f_npi = { 0xc0490fdb, 0xc0490fdb, 0xc0490fdb, 0xc0490fdb }; | |
static const vector unsigned int _cp_f_morebits = { 0x38800000, 0x38800000, 0x38800000, 0x38800000 }; | |
static const vector unsigned int _cp_f_hmorebits = { 0x34000000, 0x34000000, 0x34000000, 0x34000000 }; | |
// Helper functions to load constant values into quadwords | |
static inline qword cp_flpio4(void) { return si_lqa((intptr_t)&_cp_f_pio4); } | |
static inline qword cp_flt3p8(void) { return si_lqa((intptr_t)&_cp_f_t3p8); } | |
static inline qword cp_flnpio2(void) { return si_lqa((intptr_t)&_cp_f_npio2); } | |
static inline qword cp_flpio2(void) { return si_lqa((intptr_t)&_cp_f_pio2); } | |
static inline qword cp_flpt66(void) { return si_lqa((intptr_t)&_cp_f_pt66); } | |
static inline qword cp_flpi(void) { return si_lqa((intptr_t)&_cp_f_pi); } | |
static inline qword cp_flnpi(void) { return si_lqa((intptr_t)&_cp_f_npi); } | |
static inline qword cp_filzero(void) { return si_ilhu((int16_t)0x0000); } | |
static inline qword cp_filnzero(void) { return si_ilhu((int16_t)0x8000); } | |
static inline qword cp_filone(void) { return si_ilhu((int16_t)0x3f80); } | |
static inline qword cp_filtwo(void) { return si_ilhu((int16_t)0x4000); } | |
static inline qword cp_filinf(void) { return si_ilhu((int16_t)0x7f80); } | |
static inline qword cp_filninf(void) { return si_ilhu((int16_t)0xff80); } | |
static inline qword cp_filnan(void) { return si_ilhu((int16_t)0x7fc0); } | |
// Polynomial coefficients for cp_fatan approximation | |
static const vector unsigned int _cp_f_atan_p7 = { 0x3c08876a, 0x3c08876a, 0x3c08876a, 0x3c08876a }; | |
static const vector unsigned int _cp_f_atan_p6 = { 0xbd954629, 0xbd954629, 0xbd954629, 0xbd954629 }; | |
static const vector unsigned int _cp_f_atan_p5 = { 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1 }; | |
static const vector unsigned int _cp_f_atan_p4 = { 0xbf49eee6, 0xbf49eee6, 0xbf49eee6, 0xbf49eee6 }; | |
static const vector unsigned int _cp_f_atan_p3 = { 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5 }; | |
static const vector unsigned int _cp_f_atan_p2 = { 0xbf62365c, 0xbf62365c, 0xbf62365c, 0xbf62365c }; | |
static const vector unsigned int _cp_f_atan_p1 = { 0x3f490965, 0x3f490965, 0x3f490965, 0x3f490965 }; | |
static const vector unsigned int _cp_f_atan_p0 = { 0xbf2697e0, 0xbf2697e0, 0xbf2697e0, 0xbf2697e0 }; | |
// Higher-degree polynomial coefficients for cp_fatan approximation | |
// These are pre-computed constant vectors representing the coefficients | |
// of the polynomials used to approximate the atan function. | |
static const vector unsigned int _cp_f_atan_q7 = { 0x3c0897d0, 0x3c0897d0, 0x3c0897d0, 0x3c0897d0 }; | |
static const vector unsigned int _cp_f_atan_q6 = { 0xbd890e31, 0xbd890e31, 0xbd890e31, 0xbd890e31 }; | |
static const vector unsigned int _cp_f_atan_q5 = { 0x3f6c4616, 0x3f6c4616, 0x3f6c4616, 0x3f6c4616 }; | |
static const vector unsigned int _cp_f_atan_q4 = { 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2 }; | |
static const vector unsigned int _cp_f_atan_q3 = { 0x3eb6679b, 0x3eb6679b, 0x3eb6679b, 0x3eb6679b }; | |
static const vector unsigned int _cp_f_atan_q2 = { 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9 }; | |
static const vector unsigned int _cp_f_atan_q1 = { 0x3f7df927, 0x3f7df927, 0x3f7df927, 0x3f7df927 }; | |
static const vector unsigned int _cp_f_atan_q0 = { 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 }; | |
// _cp_fatan(x) | |
// | |
// Computes the arctangent of the input value x using a polynomial approximation. | |
// | |
// Input: | |
// x: A quadword representing a single-precision floating-point value or a vector | |
// of four single-precision floating-point values. | |
// | |
// Output: | |
// A quadword containing the arctangent of x (or a vector of arctangents if x was a vector). | |
// | |
// This function utilizes SIMD instructions and the ILP (Instruction-Level Parallelism) | |
// capabilities of the CELL SPE to compute the arctangent of four single-precision | |
// floating-point values simultaneously. The polynomial approximation is implemented | |
// using a series of SIMD operations, taking advantage of the CELL SPE's vector operations | |
// and the Fused Multiply-Add (FMA) instruction to achieve efficient parallel computation. | |
// | |
// The FMA instruction performs a single, combined multiply-add operation, computing | |
// (a * b) + c with a single instruction. This can improve performance compared to | |
// separate multiply and add instructions, as it reduces the number of instructions required | |
// and can utilize specialized hardware for faster execution. | |
// | |
// In this implementation, the FMA instruction is used extensively in the polynomial | |
// evaluation and range reduction steps, allowing for efficient computation of expressions | |
// like (a * x^2) + (b * x) + c in a single instruction. | |
static inline qword _cp_fatan(const qword x) | |
{ | |
const qword f_one = cp_filone(); | |
const qword f_inf = cp_filinf(); | |
const qword f_ninf = cp_filninf(); | |
const qword f_msb = cp_filnzero(); | |
const qword f_zero = cp_filzero(); | |
const qword f_pt66 = cp_flpt66(); | |
const qword f_pio2 = cp_flpio2(); | |
const qword f_npio2 = cp_flnpio2(); | |
const qword f_pio4 = cp_flpio4(); | |
const qword f_t3p8 = cp_flt3p8(); | |
const qword f_atan_p0 = si_lqa((intptr_t)&_cp_f_atan_p0); | |
const qword f_atan_p1 = si_lqa((intptr_t)&_cp_f_atan_p1); | |
const qword f_atan_p2 = si_lqa((intptr_t)&_cp_f_atan_p2); | |
const qword f_atan_p3 = si_lqa((intptr_t)&_cp_f_atan_p3); | |
const qword f_atan_p4 = si_lqa((intptr_t)&_cp_f_atan_p4); | |
const qword f_atan_q0 = si_lqa((intptr_t)&_cp_f_atan_q0); | |
const qword f_atan_q1 = si_lqa((intptr_t)&_cp_f_atan_q1); | |
const qword f_atan_q2 = si_lqa((intptr_t)&_cp_f_atan_q2); | |
const qword f_atan_q3 = si_lqa((intptr_t)&_cp_f_atan_q3); | |
const qword f_atan_q4 = si_lqa((intptr_t)&_cp_f_atan_q4); | |
const qword f_morebits = si_lqa((intptr_t)&_cp_f_morebits); | |
const qword f_hmorebits = si_lqa((intptr_t)&_cp_f_hmorebits); | |
// ## | |
// ## pos_x = -x { x < 0 | |
// ## x { otherwise | |
// ## | |
// This section computes the absolute value of the input x, which is needed for the range | |
// reduction step. The sign of the input is handled separately. | |
// | |
// neg_x is computed by XORing x with f_msb, which flips the sign bit of x, effectively | |
// negating x. sign_mask is a mask that identifies if x is negative or not, using the si_fcgt | |
// intrinsic to perform a floating-point greater-than comparison with 0.0. | |
// | |
// pos_x is then computed by selecting between x and neg_x based on the sign_mask, using | |
// the si_selb intrinsic. If x is negative, neg_x (-x) is selected, otherwise x is selected. | |
// This operation effectively computes the absolute value of x. | |
const qword neg_x = si_xor(x, f_msb); | |
const qword sign_mask = si_fcgt(f_zero, x); | |
const qword pos_x = si_selb(x, neg_x, sign_mask); | |
// Range reduction | |
// ## | |
// ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) ) | |
// ## range1_mask = ( pos_x <= 0.66 ) | |
// ## range2_mask = !( range0_mask || range1_mask ) | |
// ## | |
// | |
// This section performs range reduction, which maps the input values into a smaller range | |
// where the arctangent function can be more accurately approximated using a polynomial. | |
// | |
// range0_mask identifies cases where pos_x is greater than tan(3π/8), which is a constant | |
// value f_t3p8. The si_fcgt intrinsic performs a floating-point greater-than comparison. | |
// | |
// range1_mask identifies cases where pos_x is less than or equal to 0.66, which is a constant | |
// value f_pt66. It is computed by combining two masks using si_or: range1_gt_mask (pos_x < 0.66) | |
// and range1_eq_mask (pos_x == 0.66). | |
// | |
// range2_mask identifies the remaining cases that are not covered by range0_mask or range1_mask. | |
// It is computed by performing a bitwise NOR operation (si_nor) on range0_mask and range1_mask. | |
// | |
// These range masks are used to select different computations and approximations for the | |
// arctangent function, based on the range in which the input value falls. | |
const qword range0_mask = si_fcgt(pos_x, f_t3p8); | |
const qword range1_gt_mask = si_fcgt(f_pt66, pos_x); | |
const qword range1_eq_mask = si_fceq(f_pt66, pos_x); | |
const qword range1_mask = si_or(range1_gt_mask, range1_eq_mask); | |
const qword range2_mask = si_nor(range0_mask, range1_mask); | |
// Range reduction using FMA | |
// | |
// range0_x = -1.0 | |
// ----- | |
// pos_x | |
// | |
// The range reduction step for the range0 case involves dividing -1.0 by pos_x. | |
// This is implemented using a series of FMA (Fused Multiply-Add) instructions, which perform | |
// the operation (a * b) + c in a single instruction for improved performance. | |
// | |
// The steps to compute range0_x are: | |
// range0_x0 = pos_x - (trunc(pos_x) * pos_x) // Compute the fractional part of pos_x | |
// range0_x1 = trunc(pos_x) // Compute the integer part of pos_x | |
// range0_x2 = -range0_x1 * pos_x + 1.0 // Compute -range0_x1 * pos_x and add 1.0 | |
// range0_x3 = range0_x2 * range0_x1 + range0_x1 // Final result: range0_x3 = -1.0 / pos_x | |
// | |
// The FMA instructions are used to improve performance by reducing the number of instructions | |
// and utilizing specialized hardware for faster execution. | |
const qword range0_x0 = si_frest(pos_x); | |
const qword range0_x1 = si_fi(pos_x, range0_x0); | |
const qword range0_x2 = si_fnms(range0_x1, pos_x, f_one); | |
const qword range0_x3 = si_fma(range0_x2, range0_x1, range0_x1); | |
const qword range0_x = si_xor(range0_x3, f_msb); | |
const qword range0_y = f_pio2; | |
// ## | |
// ## range1_x = pos_x | |
// ## range1_y = 0.0 | |
// ## | |
// | |
// For the range1 case, where the input value pos_x is less than or equal to 0.66, | |
// the range-specific values are simply set to: | |
// | |
// range1_x = pos_x | |
// range1_y = 0.0 | |
// | |
// This means that the input value itself (pos_x) is used for the arctangent approximation | |
// in this range, and the corresponding y-value is set to 0.0. | |
const qword range1_x = pos_x; | |
const qword range1_y = f_zero; | |
// ## | |
// ## range2_x = (pos_x-1.0) | |
// ## ----------- | |
// ## (pos_x+1.0) | |
// ## | |
// ## range2_y = PI | |
// ## --- | |
// ## 4.0 | |
// ## | |
// | |
// For the range2 case, the range reduction involves computing (pos_x - 1.0) / (pos_x + 1.0). | |
// This is done using a series of steps: | |
// range2_x0num = pos_x - 1.0 // Compute the numerator | |
// range2_x0den = pos_x + 1.0 // Compute the denominator | |
// range2_x0 = range2_x0den - (trunc(range2_x0den) * range2_x0den) // Compute the fractional part of the denominator | |
// range2_x1 = -range2_x0 * range2_x0den + 1.0 // Compute the reciprocal of the denominator | |
// range2_x2 = range2_x1 * range2_x0 // Refine the reciprocal | |
// range2_x = range2_x0num * range2_x2 // Final result: range2_x = (pos_x - 1.0) / (pos_x + 1.0) | |
// | |
// The range2_y value is set to PI/4.0, which is used in the final arctangent computation. | |
const qword range2_y = f_pio4; | |
const qword range2_x0num = si_fs(pos_x, f_one); | |
const qword range2_x0den = si_fa(pos_x, f_one); | |
const qword range2_x0 = si_frest(range2_x0den); | |
const qword range2_x1 = si_fnms(range2_x0, range2_x0den, f_one); | |
const qword range2_x2 = si_fma(range2_x1, range2_x0, range2_x0); | |
const qword range2_x = si_fm(range2_x0num, range2_x2); | |
// ## | |
// ## range_x = range0_x { range0_mask | |
// ## range1_x { range1_mask | |
// ## range2_x { range2_mask | |
// ## | |
// ## range_y = range0_y { range0_mask | |
// ## range1_y { range1_mask | |
// ## range2_y { range2_mask | |
// ## | |
// | |
// Based on the range masks computed earlier (range0_mask, range1_mask, range2_mask), | |
// this section selects the appropriate range-specific values for x (range_x) and y (range_y). | |
// | |
// The range-specific values (range0_x, range1_x, range2_x, range0_y, range1_y, range2_y) | |
// are precomputed constants that are used in the arctangent approximation for each range. | |
// | |
// The si_selb intrinsic is used to select the appropriate value based on the range masks. | |
// It performs a conditional selection operation, selecting the first value if the corresponding | |
// mask bit is 0, and the second value if the mask bit is 1. | |
// | |
// For range_x: | |
// 1. range_x0 is computed by selecting between range2_x and range0_x based on range0_mask. | |
// 2. range_x is computed by selecting between range_x0 and range1_x based on range1_mask. | |
// | |
// For range_y: | |
// 1. range_y0 is computed by selecting between range2_y and range0_y based on range0_mask. | |
// 2. range_y is computed by selecting between range_y0 and range1_y based on range1_mask. | |
// | |
// These range-specific values are used in the subsequent steps of the arctangent approximation. | |
const qword range_x0 = si_selb(range2_x, range0_x, range0_mask); | |
const qword range_x = si_selb(range_x0, range1_x, range1_mask); | |
const qword range_y0 = si_selb(range2_y, range0_y, range0_mask); | |
const qword range_y = si_selb(range_y0, range1_y, range1_mask); | |
// Polynomial approximation using FMA | |
// | |
// zdiv = P + P xp2 + P xp2^2 + P xp2^3 + P xp2^4 | |
// 0 1 2 3 4 | |
// ------------------------------------------ | |
// Q + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5 | |
// 0 1 2 3 4 | |
// | |
// The polynomial approximation is implemented using Estrin's method, evaluating | |
// the numerator and denominator polynomials separately using FMA instructions, | |
// and then dividing the numerator by the denominator to obtain the final result. | |
// | |
// The numerator is computed as: | |
// znum0 = f_atan_p0 | |
// znum1 = (znum0 * xp2) + f_atan_p1 | |
// znum2 = (znum1 * xp2) + f_atan_p2 | |
// znum3 = (znum2 * xp2) + f_atan_p3 | |
// znum = (znum3 * xp2) + f_atan_p4 | |
// | |
// The denominator is computed as: | |
// zden0 = xp2 + f_atan_q0 | |
// zden1 = (zden0 * xp2) + f_atan_q1 | |
// zden2 = (zden1 * xp2) + f_atan_q2 | |
// zden3 = (zden2 * xp2) + f_atan_q3 | |
// zden = (zden3 * xp2) + f_atan_q4 + xp2 | |
// | |
// The final result, zdiv, is computed by dividing znum by zden using | |
// additional FMA instructions for improved accuracy. | |
const qword xp2 = si_fm(range_x, range_x); | |
const qword znum0 = f_atan_p0; | |
const qword znum1 = si_fma(znum0, xp2, f_atan_p1); // FMA: (znum0 * xp2) + f_atan_p1 | |
const qword znum2 = si_fma(znum1, xp2, f_atan_p2); // FMA: (znum1 * xp2) + f_atan_p2 | |
const qword znum3 = si_fma(znum2, xp2, f_atan_p3); // FMA: (znum2 * xp2) + f_atan_p3 | |
const qword znum = si_fma(znum3, xp2, f_atan_p4); // FMA: (znum3 * xp2) + f_atan_p4 | |
const qword zden0 = si_fa(xp2, f_atan_q0); | |
const qword zden1 = si_fma(zden0, xp2, f_atan_q1); // FMA: (zden0 * xp2) + f_atan_q1 | |
const qword zden2 = si_fma(zden1, xp2, f_atan_q2); // FMA: (zden1 * xp2) + f_atan_q2 | |
const qword zden3 = si_fma(zden2, xp2, f_atan_q3); // FMA: (zden2 * xp2) + f_atan_q3 | |
const qword zden = si_fma(zden3, xp2, f_atan_q4); // FMA: (zden3 * xp2) + f_atan_q4 | |
const qword zden_r0 = si_frest(zden); | |
const qword zden_r1 = si_fnms(zden_r0, zden, f_one); | |
const qword zden_r = si_fma(zden_r1, zden_r0, zden_r0); | |
const qword zdiv = si_fm(znum, zden_r); | |
const qword z0 = si_fm(xp2, zdiv); | |
const qword z1 = si_fma(range_x, z0, range_x); | |
// ## | |
// ## zadd = z1 + 0.5 * MOREBITS { range2_mask | |
// ## z1 + MOREBITS { range1_mask | |
// ## z1 { otherwise | |
// ## | |
// ## yaddz = range_y + zadd | |
// ## | |
// ## pos_yaddz = yaddz { yaddz >= 0 | |
// ## -yaddz { yaddz < 0 | |
// ## | |
const qword zadd0 = si_selb(f_zero, f_hmorebits, range2_mask); | |
const qword zadd1 = si_selb(zadd0, f_morebits, range1_mask); | |
const qword zadd = si_fa(z1, zadd1); | |
const qword yaddz = si_fa(range_y, zadd); | |
const qword neg_yaddz = si_xor(yaddz, f_msb); | |
const qword pos_yaddz = si_selb(yaddz, neg_yaddz, sign_mask); | |
// Handling special cases | |
// | |
// result_y0 = 0.0 { x == 0.0 | |
// pos_yaddz { otherwise | |
// | |
// In this step, the function checks if the input value x is equal to 0.0. | |
// If it is, the result is set to 0.0. Otherwise, the result is set to pos_yaddz, | |
// which is the computed arctangent value after range reduction and polynomial | |
// approximation. | |
const qword x_eqz_mask = si_fceq(f_zero, x); | |
const qword result_y0 = si_selb(pos_yaddz, x, x_eqz_mask); | |
// result_y2 = +PI { | |
// --- { x == INF | |
// 2.0 { | |
// | |
// -PI { | |
// --- { x == -INF | |
// 2.0 { | |
// | |
// result_y0 { otherwise | |
// | |
// This section handles the cases where the input value x is positive or negative | |
// infinity. If x is positive infinity, the result is set to pi/2.0. If x is negative | |
// infinity, the result is set to -pi/2.0. Otherwise, the result is set to result_y0, | |
// which is the computed arctangent value from the previous step. | |
const qword x_eqinf_mask = si_fceq(f_inf, x); | |
const qword x_eqninf_mask = si_fceq(f_ninf, x); | |
const qword result_y1 = si_selb(result_y0, f_pio2, x_eqinf_mask); | |
const qword result = si_selb(result_y1, f_npio2, x_eqninf_mask); | |
return result; | |
} | |
// cp_fatan(x) | |
// | |
// Computes the arctangent of the input vector float x using _cp_fatan. | |
// | |
// Input: | |
// x: A vector float representing four single-precision floating-point values. | |
// | |
// Output: | |
// A vector float containing the arctangent of each element in x. | |
// | |
// This function acts as a wrapper around _cp_fatan, converting the input | |
// vector float to a quadword, computing the arctangent using _cp_fatan, | |
// and then converting the result back to a vector float. | |
static inline vector float cp_fatan(const vector float x) | |
{ | |
return (vector float)(_cp_fatan((qword)x)); | |
} | |
// cp_fatan_scalar(x) | |
// | |
// Computes the arctangent of the input single-precision float x using _cp_fatan. | |
// | |
// Input: | |
// x: A single-precision floating-point value. | |
// | |
// Output: | |
// The arctangent of x as a single-precision floating-point value. | |
// | |
// This function acts as a wrapper around _cp_fatan, converting the input | |
// float to a quadword, computing the arctangent using _cp_fatan, and then | |
// converting the result back to a float. | |
// | |
// The steps involved are: | |
// 1. Convert the input float x to a quadword vx using si_from_float. | |
// 2. Call _cp_fatan(vx) to compute the arctangent of vx, storing the result in vresult. | |
// 3. Convert the quadword vresult back to a float result using si_to_float. | |
// 4. Return the float result. | |
// | |
// This function provides a convenient way to compute the arctangent of a single float value, | |
// while still leveraging the SIMD capabilities of the _cp_fatan function. | |
static inline float cp_fatan_scalar(const float x) | |
{ | |
const qword vx = si_from_float(x); | |
const qword vresult = _cp_fatan(vx); | |
const float result = si_to_float(vresult); | |
return result; | |
} | |
static inline float cp_fatan_scalar(const float x) | |
{ | |
const qword vx = si_from_float(x); | |
const qword vresult = _cp_fatan(vx); | |
const float result = si_to_float(vresult); | |
return result; | |
} | |
// ## cp_fatan2(y, x) | |
// ## | |
// ## Function to compute the angle (in radians) between the positive x-axis and the | |
// ## point (x, y) in the Cartesian plane, using the atan2 function. | |
// ## | |
// ## Input: | |
// ## y: float or vector float representing the y-coordinate | |
// ## x: float or vector float representing the x-coordinate | |
// ## | |
// ## Output: | |
// ## The angle (theta) in radians between the positive x-axis and the point (x, y) | |
// ## in the range [-pi, pi]. | |
// ## | |
// ## Mathematical Description: | |
// ## The atan2 function is a variation of the arctangent function, which computes | |
// ## the angle (in radians) between the positive x-axis and the vector (x, y) in the | |
// ## Cartesian plane. It is defined as: | |
// ## | |
// ## atan2(y, x) = arctan(y/x) | |
// ## | |
// ## However, the standard arctangent function (arctan) has a range of [-pi/2, pi/2], | |
// ## which is not sufficient to represent all possible angles in the Cartesian plane. | |
// ## The atan2 function extends the range to [-pi, pi] by considering the signs of | |
// ## both x and y. | |
// ## | |
// ## The mathematical relationships governing atan2 are as follows: | |
// ## | |
// ## atan2(y, x) = arctan(y/x) (for x > 0) | |
// ## atan2(y, x) = arctan(y/x) + pi (for x < 0 and y >= 0) | |
// ## atan2(y, x) = arctan(y/x) - pi (for x < 0 and y < 0) | |
// ## | |
// ## Additionally, the atan2 function handles special cases where either x or y is zero, | |
// ## positive infinity, or negative infinity, as described in the special cases section. | |
// ## | |
// ## The implementation of cp_fatan2 follows these mathematical relationships and special | |
// ## cases to compute the correct angle for any given (x, y) coordinate pair. | |
// ## | |
// ## Special cases: | |
// ## - If y is +0 and x is negative, the result is +pi | |
// ## - If y is -0 and x is negative, the result is -pi | |
// ## - If y is +0 and x is positive, the result is +0 | |
// ## - If y is -0 and x is positive, the result is -0 | |
// ## - If y is negative and x is 0, the result is -pi or pi/2 (depending on sign of y) | |
// ## - If y is positive and x is 0, the result is pi or pi/2 (depending on sign of y) | |
// ## - If either y or x is NaN, the result is NaN | |
// ## - If y is +0 and x is -0, the result is +pi | |
// ## - If y is -0 and x is -0, the result is -pi | |
// ## - If y is +0 and x is +0, the result is +0 | |
// ## - If y is -0 and x is +0, the result is -0 | |
// ## - If y is +inf and x is +inf, the result is pi/4 | |
// ## - If y is -inf and x is +inf, the result is -pi/4 | |
// ## - If y is +inf and x is -inf, the result is 3pi/4 | |
// ## - If y is -inf and x is -inf, the result is -3pi/4 | |
// ## - If y is finite and positive, and x is -inf, the result is +pi | |
// ## - If y is finite and negative, and x is -inf, the result is -pi | |
// ## - If y is finite and positive, and x is +inf, the result is +0 | |
// ## - If y is finite and negative, and x is +inf, the result is -0 | |
// ## - If x is finite and y is +inf, the result is pi/2 | |
// ## - If x is finite and y is -inf, the result is -pi/2 | |
// ## | |
// ## General cases: | |
// ## - If x is negative and y is non-negative, the result is pi + atan(y/x) | |
// ## - If x is negative and y is negative, the result is -pi + atan(y/x) | |
// ## - Otherwise, the result is atan(y/x) | |
qword _cp_fatan2(qword y, qword x) | |
{ | |
const qword f_one = cp_filone(); | |
const qword f_zero = cp_filzero(); | |
const qword f_pi = cp_flpi(); | |
const qword f_npi = cp_flnpi(); | |
const qword f_inf = cp_filinf(); | |
const qword f_ninf = cp_filninf(); | |
// ## | |
// ## yox = y | |
// ## - | |
// ## x | |
// ## | |
// | |
// This section computes the ratio yox = y / x, which is a key step in the atan2(y, x) computation. | |
// The ratio yox is computed using a series of FMA (Fused Multiply-Add) instructions for improved | |
// accuracy and performance. | |
// | |
// First, x_r0 computes the fractional part of x using the si_frest intrinsic. | |
// x_r1 computes (1 - x_r0) * x using the si_fnms (Fused Negative Multiply-Subtract) intrinsic. | |
// x_r is then computed by refining the value of (1 - x_r0) using an FMA instruction. | |
// | |
// Finally, yox is computed by dividing y by x_r using the si_fm (Floating-Point Multiplication) intrinsic. | |
const qword x_r0 = si_frest(x); | |
const qword x_r1 = si_fnms(x_r0, x, f_one); | |
const qword x_r = si_fma(x_r1, x_r0, x_r0); | |
const qword yox = si_fm(y, x_r); | |
// ## | |
// ## z = +PI + cp_atan(yox) { (x < 0.0) && (y >= 0.0) | |
// ## -PI + cp_atan(yox) { (x < 0.0) && (y < 0.0) | |
// ## 0.0 + cp_atan(yox) { otherwise | |
// ## | |
// | |
// Based on the signs of x and y, this section computes the final result z by adding or subtracting | |
// multiples of PI to the result of cp_atan(yox), which is the arctangent of yox computed using | |
// the polynomial approximation. | |
// | |
// First, masks are created to identify the cases where x is negative (x_ltz_mask), y is negative (y_ltz_mask), | |
// and both x and y are negative (xy_ltz_mask). | |
// | |
// zadd0 is computed by selecting between 0.0 and +PI based on x_ltz_mask, which identifies if x is negative. | |
// | |
// zadd is then computed by selecting between zadd0 and -PI based on xy_ltz_mask, which identifies if both | |
// x and y are negative. | |
// | |
// The atan_yox value is computed by calling the _cp_fatan function, which computes the arctangent of yox | |
// using the polynomial approximation. | |
// | |
// Finally, result is computed by adding zadd (which can be 0.0, +PI, or -PI) to atan_yox, effectively | |
// adjusting the arctangent result based on the signs of x and y. | |
const qword x_ltz_mask = si_fcgt(f_zero, x); | |
const qword y_ltz_mask = si_fcgt(f_zero, y); | |
const qword xy_ltz_mask = si_and(x_ltz_mask, y_ltz_mask); | |
const qword zadd0 = si_selb(f_zero, f_pi, x_ltz_mask); | |
const qword zadd = si_selb(zadd0, f_npi, xy_ltz_mask); | |
const qword atan_yox = _cp_fatan(yox); | |
const qword result = si_fa(zadd, atan_yox); | |
// Improved special case handling | |
// Special cases need to be handled carefully to ensure correct results when dealing with | |
// corner cases or exceptional inputs. The CELL BE architecture provides efficient SIMD | |
// operations and intrinsics for handling these special cases in a parallel manner. | |
// The following special cases are handled in this implementation: | |
// Create masks to identify if x or y are finite values (not NaN or infinity). | |
// The si_andc intrinsic performs a bitwise AND with the complement, clearing the | |
// exponent bits of x and y. The si_fcgt intrinsic performs a floating-point greater-than | |
// comparison with +inf, effectively checking if the values are finite. | |
const qword x_isfinite_mask = si_fcgt(f_inf, si_andc(x, spu_splats(0x7f800000))); | |
const qword y_isfinite_mask = si_fcgt(f_inf, si_andc(y, spu_splats(0x7f800000))); | |
// Create masks to identify if x or y are equal to positive or negative infinity. | |
// The si_fceq intrinsic performs a floating-point equal-to comparison with +inf and -inf. | |
const qword x_eqinf_mask = si_fceq(f_inf, x); | |
const qword x_eqninf_mask = si_fceq(f_ninf, x); | |
const qword y_eqinf_mask = si_fceq(f_inf, y); | |
const qword y_eqninf_mask = si_fceq(f_ninf, y); | |
// Create masks to identify if x or y are equal to zero (positive or negative). | |
// The si_fceq intrinsic performs a floating-point equal-to comparison with +0.0. | |
const qword x_eqz_mask = si_fceq(f_zero, x); | |
const qword y_eqz_mask = si_fceq(f_zero, y); | |
// Combine masks to identify cases where both x and y are positive or negative infinity. | |
const qword xy_eqinf_mask = si_and(x_eqinf_mask, y_eqinf_mask); | |
const qword xy_eqninf_mask = si_and(x_eqninf_mask, y_eqninf_mask); | |
// Create masks for cases where one input is infinity and the other is not. | |
const qword y_posinf_x_neginf_mask = si_and(y_eqinf_mask, x_eqninf_mask); | |
const qword y_neginf_x_neginf_mask = si_and(y_eqninf_mask, x_eqninf_mask); | |
const qword y_posinf_x_posinf_mask = si_and(y_eqinf_mask, x_eqinf_mask); | |
const qword y_neginf_x_posinf_mask = si_and(y_eqninf_mask, x_eqinf_mask); | |
// Create masks for cases where one input is finite and the other is negative infinity. | |
const qword y_posfinite_x_neginf_mask = si_and(y_isfinite_mask, si_fceq(f_ninf, x)); | |
const qword y_negfinite_x_neginf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_ninf, x)); | |
const qword y_posfinite_x_posinf_mask = si_and(y_isfinite_mask, si_fceq(f_inf, x)); | |
const qword y_negfinite_x_posinf_mask = si_and(si_fcgt(f_zero, y), si_fceq(f_inf, x)); | |
// Create masks for cases where x is finite and y is positive or negative infinity. | |
const qword x_isfinite_y_posinf_mask = si_and(x_isfinite_mask, y_eqinf_mask); | |
const qword x_isfinite_y_neginf_mask = si_and(x_isfinite_mask, y_eqninf_mask); | |
// Combine all the special case masks using bitwise OR operations. | |
const qword special_case_mask = si_or(xy_eqinf_mask, xy_eqninf_mask); | |
special_case_mask = si_or(special_case_mask, y_posinf_x_neginf_mask); | |
special_case_mask = si_or(special_case_mask, y_neginf_x_neginf_mask); | |
special_case_mask = si_or(special_case_mask, y_posinf_x_posinf_mask); | |
special_case_mask = si_or(special_case_mask, y_neginf_x_posinf_mask); | |
special_case_mask = si_or(special_case_mask, y_posfinite_x_neginf_mask); | |
special_case_mask = si_or(special_case_mask, y_negfinite_x_neginf_mask); | |
special_case_mask = si_or(special_case_mask, y_posfinite_x_posinf_mask); | |
special_case_mask = si_or(special_case_mask, y_negfinite_x_posinf_mask); | |
special_case_mask = si_or(special_case_mask, x_isfinite_y_posinf_mask); | |
special_case_mask = si_or(special_case_mask, x_isfinite_y_neginf_mask); | |
special_case_mask = si_or(special_case_mask, x_eqz_mask); | |
special_case_mask = si_or(special_case_mask, y_eqz_mask); | |
// Compute the result for special cases using SIMD select operations. | |
const qword y_posinf_x_neginf_result = si_fm(si_ilhu(0x4049), f_one); // 3*pi/4 | |
const qword y_neginf_x_neginf_result = si_fm(si_ilhu(0xc049), f_one); // -3*pi/4 | |
const qword y_posinf_x_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/4 | |
const qword y_neginf_x_posinf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/4 | |
const qword y_posfinite_x_neginf_result = f_pi; | |
const qword y_negfinite_x_neginf_result = f_npi; | |
const qword y_posfinite_x_posinf_result = f_zero; | |
const qword y_negfinite_x_posinf_result = si_xor(f_zero, f_msb); | |
const qword x_isfinite_y_posinf_result = si_fm(f_pio2, si_ilhu(0x3f80)); // pi/2 | |
const qword x_isfinite_y_neginf_result = si_fm(f_npio2, si_ilhu(0x3f80)); // -pi/2 | |
const qword x_eqz_result = si_selb(f_pi, f_zero, si_fcgt(f_zero, y)); | |
const qword y_eqz_result = si_selb(f_zero, si_xor(f_zero, f_msb), si_fcgt(f_zero, x)); | |
// Select the appropriate result value based on the special case masks using si_selb. | |
const qword special_result = si_selb(result, y_posinf_x_neginf_result, y_posinf_x_neginf_mask); | |
special_result = si_selb(special_result, y_neginf_x_neginf_result, y_neginf_x_neginf_mask); | |
special_result = si_selb(special_result, y_posinf_x_posinf_result, y_posinf_x_posinf_mask); | |
special_result = si_selb(special_result, y_neginf_x_posinf_result, y_neginf_x_posinf_mask); | |
special_result = si_selb(special_result, y_posfinite_x_neginf_result, y_posfinite_x_neginf_mask); | |
special_result = si_selb(special_result, y_negfinite_x_neginf_result, y_negfinite_x_neginf_mask); | |
special_result = si_selb(special_result, y_posfinite_x_posinf_result, y_posfinite_x_posinf_mask); | |
special_result = si_selb(special_result, y_negfinite_x_posinf_result, y_negfinite_x_posinf_mask); | |
special_result = si_selb(special_result, x_isfinite_y_posinf_result, x_isfinite_y_posinf_mask); | |
special_result = si_selb(special_result, x_isfinite_y_neginf_result, x_isfinite_y_neginf_mask); | |
special_result = si_selb(special_result, x_eqz_result, x_eqz_mask); | |
special_result = si_selb(special_result, y_eqz_result, y_eqz_mask); | |
return si_selb(result, special_result, special_case_mask); | |
} | |
vector float cp_fatan2(vector float arg0, vector float arg1) | |
{ | |
const qword y = (qword)arg0; | |
const qword x = (qword)arg1; | |
const qword result = _cp_fatan2(y, x); | |
return (vector float)result; | |
} | |
// cp_fatan2_scalar(arg0, arg1) | |
// | |
// Computes the atan2(y, x) function using _cp_fatan2 for single-precision float inputs. | |
// | |
// Input: | |
// arg0: The y-coordinate as a single-precision float. | |
// arg1: The x-coordinate as a single-precision float. | |
// | |
// Output: | |
// The angle (theta) in radians between the positive x-axis and the point (x, y), | |
// in the range [-pi, pi], as a single-precision float. | |
// | |
// This function is a scalar wrapper around _cp_fatan2, which computes atan2(y, x) using | |
// SIMD operations. The steps involved are: | |
// 1. Convert the input floats arg0 (y) and arg1 (x) to quadwords y and x using si_from_float. | |
// 2. Call _cp_fatan2(y, x) to compute the atan2(y, x) value, storing the result in z. | |
// 3. Convert the quadword z back to a float result using si_to_float. | |
// 4. Return the float result. | |
// | |
// This function provides a convenient way to compute atan2(y, x) for single-precision float | |
// inputs while leveraging the SIMD capabilities of the _cp_fatan2 function. | |
float cp_fatan2_scalar(float arg0, float arg1) | |
{ | |
const qword y = si_from_float(arg0); | |
const qword x = si_from_float(arg1); | |
const qword z = _cp_fatan2(y, x); | |
const float result = si_to_float(z); | |
return result; | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment