Last active
February 25, 2021 05:59
-
-
Save anug7/2d4411d7445a4cd6369906641c580409 to your computer and use it in GitHub Desktop.
Arm64 SIMD atan2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <arm_neon.h> | |
const double __atanf_lut[4] = { | |
-0.0443265554792128, //p7 | |
-0.3258083974640975, //p3 | |
+0.1555786518463281, //p5 | |
+0.9997878412794807 //p1 | |
}; | |
const double __atanf_pi_2 = M_PI_2 | |
//Taken from https://github.com/xboxfanj/math-neon/blob/master/math_atanf.c and converted to arm64 | |
void atanf_neon_hfp(double *ip, double *op) | |
{ | |
__asm__ volatile ( | |
"ld1 {v0.2d}, [%2] \n\t" //v0 = {x1, x2}; | |
"ld1r {v4.2d}, [%1] \n\t" //v4 = {pi/2, pi/2}; | |
"ld1 {v6.2d}, [%2] \n\t" //v6 = {x1, x2}; | |
"fcmeq v30.2d, v6.2d, #0 \n\t" | |
"mvn v31.16b, v30.16b \n\t" | |
"ushr v31.2d, v31.2d, #63 \n\t" | |
"scvtf v31.2d, v31.2d \n\t" | |
"ushr v30.2d, v30.2d, #63 \n\t" | |
"scvtf v30.2d, v30.2d \n\t" //d30 = (float) d30; | |
// add 1 if x == 0 to handle inf when x == 0 | |
"fadd v0.2d, v30.2d, v0.2d \n\t" | |
"fabs v0.2d, v0.2d \n\t" //v0 = fabs(d0) ; | |
//Refer: https://stackoverflow.com/questions/6759897/divide-by-floating-point-number-using-neon-intrinsics | |
//fast reciporical approximation - Netwon Raphson two steps to refine. | |
//TODO: Can be replaced by fdiv? as Netwon Raphson is used before fdiv is introduced in arm64 | |
//"frecpe v1.2d, v0.2d \n\t" //v1 = ~ 1 / v0; | |
//"frecps v2.2d, v1.2d, v0.2d \n\t" //v2 = 2.0 - v1 * v0; | |
//"fmul v1.2d, v1.2d, v2.2d \n\t" //v1 = v1 * v2; | |
//"frecps v2.2d, v1.2d, v0.2d \n\t" //v2 = 2.0 - v1 * v0; | |
//"fmul v1.2d, v1.2d, v2.2d \n\t" //v1 = v1 * v2; | |
"mov x1, #1 \n\t" | |
"dup v7.2d, x1 \n\t" | |
"scvtf v8.2d, v7.2d \n\t" | |
"fdiv v1.2d, v8.2d, v0.2d \n\t" //v1 = 1 / v0 | |
//if |x| > 1.0 -> ax = -1/ax, r = pi/2 | |
"fadd v1.2d, v1.2d, v0.2d \n\t" //v1 = v1 + v0; | |
"fcmgt v3.2d, v0.2d, v8.2d \n\t" //v3 = (v0 > 1); | |
"ushr v3.2d, v3.2d, #63 \n\t" | |
"scvtf v5.2d, v3.2d \n\t" //v5 = (float) v3; | |
"fmls v0.2d, v1.2d, v5.2d \n\t" //v0 = v0 - v1 * v5; | |
// r | |
"fmul v7.2d, v4.2d, v5.2d \n\t" //v7 = v5 * v4; | |
//polynomial: | |
"fmul v2.2d, v0.2d, v0.2d \n\t" //v2 = v0 * v0 = {ax1^2, ax2^2} | |
"ld2 {v4.2d, v5.2d}, [%0] \n\t" //v4 = {p7, p5}, d5 = {p3, p1} | |
"fmul v3.2d, v2.2d, v2.2d \n\t" //v3 = v2 * v2 = {x1^4, x2^4} | |
"fmul v8.2d, v0.2d, v4.2d[0] \n\t" //v8 = v0 * v4.2d[0] = {p7x1, p7x2} | |
"fmul v9.2d, v0.2d, v4.2d[1] \n\t" //v9 = v0 * v4.2d[1] = {p5x1, p5x2} | |
"fmul v10.2d, v0.2d, v5.2d[0] \n\t" //v10 = v0 * v5.2d[0] = {p3x1, p3x2} | |
"fmul v11.2d, v0.2d, v5.2d[1] \n\t" //v11 = v0 * v5.2d[1] = {p1x1, p1x2} | |
//a = p7x * x^2 + p5x; | |
"fmla v9.2d, v8.2d, v2.2d \n\t" //v9 = v9 + (v8 * v2) = {p5x1 + p7x1 * x1^2, p5x2 + p7x2 * x2^2} | |
//b = p3x * x^2 + p1x; | |
"fmla v11.2d, v10.2d, v2.2d \n\t" //v11 = v11 + (v10 * v2) = {p1x1 + p3x1 * x1^2, p1x2 + p3x2 * x2^2} | |
// b = b + a * x^4 | |
"fmla v11.2d, v9.2d, v3.2d \n\t" //v11 = v11 + v9 * v3 = {(p1x1 + p3x1^3) + (p5x1 + p7x1^3) * x1^4, (p1x2 + p3x2^3) + (p5x2 + p7x2^3) * x2^4} | |
// r = r + b | |
"fadd v7.2d, v7.2d, v11.2d \n\t" //v7 = v7 + v11 = {rx1 + bx1, rx2 + b * x2} | |
// a = 2 * r | |
"fadd v9.2d, v7.2d, v7.2d \n\t" // v9 = 2 * v7 | |
// b = x < 0.0 | |
"fcmlt v3.2d, v6.2d, #0 \n\t" //d3 = (d6 < 0); | |
"ushr v3.2d, v3.2d, #63 \n\t" | |
"scvtf v5.2d, v3.2d \n\t" //d5 = (float) d3; | |
//r = r - a * b | |
"fmls v7.2d, v9.2d, v5.2d \n\t" // v7 = v7 - v9 * v5; | |
// Check if output is nan happens if x == 0 | |
"fmul v7.2d, v7.2d, v31.2d \n\t" //handle x == 0 | |
"st1 {v7.2d}, [%3]" | |
: | |
: "r"(__atanf_lut), "r"(&__atanf_pi_2), "r"(ip), "r"(op) | |
: | |
); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
compile with "-march=armv8-a"