Skip to content

Instantly share code, notes, and snippets.

@anug7
Last active February 25, 2021 05:59
Show Gist options
  • Save anug7/2d4411d7445a4cd6369906641c580409 to your computer and use it in GitHub Desktop.
Save anug7/2d4411d7445a4cd6369906641c580409 to your computer and use it in GitHub Desktop.
Arm64 SIMD atan2
#include <arm_neon.h>
const double __atanf_lut[4] = {
-0.0443265554792128, //p7
-0.3258083974640975, //p3
+0.1555786518463281, //p5
+0.9997878412794807 //p1
};
const double __atanf_pi_2 = M_PI_2
//Taken from https://github.com/xboxfanj/math-neon/blob/master/math_atanf.c and converted to arm64
void atanf_neon_hfp(double *ip, double *op)
{
__asm__ volatile (
"ld1 {v0.2d}, [%2] \n\t" //v0 = {x1, x2};
"ld1r {v4.2d}, [%1] \n\t" //v4 = {pi/2, pi/2};
"ld1 {v6.2d}, [%2] \n\t" //v6 = {x1, x2};
"fcmeq v30.2d, v6.2d, #0 \n\t"
"mvn v31.16b, v30.16b \n\t"
"ushr v31.2d, v31.2d, #63 \n\t"
"scvtf v31.2d, v31.2d \n\t"
"ushr v30.2d, v30.2d, #63 \n\t"
"scvtf v30.2d, v30.2d \n\t" //d30 = (float) d30;
// add 1 if x == 0 to handle inf when x == 0
"fadd v0.2d, v30.2d, v0.2d \n\t"
"fabs v0.2d, v0.2d \n\t" //v0 = fabs(d0) ;
//Refer: https://stackoverflow.com/questions/6759897/divide-by-floating-point-number-using-neon-intrinsics
//fast reciporical approximation - Netwon Raphson two steps to refine.
//TODO: Can be replaced by fdiv? as Netwon Raphson is used before fdiv is introduced in arm64
//"frecpe v1.2d, v0.2d \n\t" //v1 = ~ 1 / v0;
//"frecps v2.2d, v1.2d, v0.2d \n\t" //v2 = 2.0 - v1 * v0;
//"fmul v1.2d, v1.2d, v2.2d \n\t" //v1 = v1 * v2;
//"frecps v2.2d, v1.2d, v0.2d \n\t" //v2 = 2.0 - v1 * v0;
//"fmul v1.2d, v1.2d, v2.2d \n\t" //v1 = v1 * v2;
"mov x1, #1 \n\t"
"dup v7.2d, x1 \n\t"
"scvtf v8.2d, v7.2d \n\t"
"fdiv v1.2d, v8.2d, v0.2d \n\t" //v1 = 1 / v0
//if |x| > 1.0 -> ax = -1/ax, r = pi/2
"fadd v1.2d, v1.2d, v0.2d \n\t" //v1 = v1 + v0;
"fcmgt v3.2d, v0.2d, v8.2d \n\t" //v3 = (v0 > 1);
"ushr v3.2d, v3.2d, #63 \n\t"
"scvtf v5.2d, v3.2d \n\t" //v5 = (float) v3;
"fmls v0.2d, v1.2d, v5.2d \n\t" //v0 = v0 - v1 * v5;
// r
"fmul v7.2d, v4.2d, v5.2d \n\t" //v7 = v5 * v4;
//polynomial:
"fmul v2.2d, v0.2d, v0.2d \n\t" //v2 = v0 * v0 = {ax1^2, ax2^2}
"ld2 {v4.2d, v5.2d}, [%0] \n\t" //v4 = {p7, p5}, d5 = {p3, p1}
"fmul v3.2d, v2.2d, v2.2d \n\t" //v3 = v2 * v2 = {x1^4, x2^4}
"fmul v8.2d, v0.2d, v4.2d[0] \n\t" //v8 = v0 * v4.2d[0] = {p7x1, p7x2}
"fmul v9.2d, v0.2d, v4.2d[1] \n\t" //v9 = v0 * v4.2d[1] = {p5x1, p5x2}
"fmul v10.2d, v0.2d, v5.2d[0] \n\t" //v10 = v0 * v5.2d[0] = {p3x1, p3x2}
"fmul v11.2d, v0.2d, v5.2d[1] \n\t" //v11 = v0 * v5.2d[1] = {p1x1, p1x2}
//a = p7x * x^2 + p5x;
"fmla v9.2d, v8.2d, v2.2d \n\t" //v9 = v9 + (v8 * v2) = {p5x1 + p7x1 * x1^2, p5x2 + p7x2 * x2^2}
//b = p3x * x^2 + p1x;
"fmla v11.2d, v10.2d, v2.2d \n\t" //v11 = v11 + (v10 * v2) = {p1x1 + p3x1 * x1^2, p1x2 + p3x2 * x2^2}
// b = b + a * x^4
"fmla v11.2d, v9.2d, v3.2d \n\t" //v11 = v11 + v9 * v3 = {(p1x1 + p3x1^3) + (p5x1 + p7x1^3) * x1^4, (p1x2 + p3x2^3) + (p5x2 + p7x2^3) * x2^4}
// r = r + b
"fadd v7.2d, v7.2d, v11.2d \n\t" //v7 = v7 + v11 = {rx1 + bx1, rx2 + b * x2}
// a = 2 * r
"fadd v9.2d, v7.2d, v7.2d \n\t" // v9 = 2 * v7
// b = x < 0.0
"fcmlt v3.2d, v6.2d, #0 \n\t" //d3 = (d6 < 0);
"ushr v3.2d, v3.2d, #63 \n\t"
"scvtf v5.2d, v3.2d \n\t" //d5 = (float) d3;
//r = r - a * b
"fmls v7.2d, v9.2d, v5.2d \n\t" // v7 = v7 - v9 * v5;
// Check if output is nan happens if x == 0
"fmul v7.2d, v7.2d, v31.2d \n\t" //handle x == 0
"st1 {v7.2d}, [%3]"
:
: "r"(__atanf_lut), "r"(&__atanf_pi_2), "r"(ip), "r"(op)
:
);
}
@anug7
Copy link
Author

anug7 commented Feb 25, 2021

compile with "-march=armv8-a"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment