Skip to content

Instantly share code, notes, and snippets.

@jcayzac
Created October 3, 2011 03:40
Show Gist options
  • Save jcayzac/1258385 to your computer and use it in GitHub Desktop.
Save jcayzac/1258385 to your computer and use it in GitHub Desktop.
Fast sine/cosine for ARMv7+NEON
#include <math.h>
/// Computes approximate sines and cosines of two angles at once.
///
/// Every output lane is evaluated with a parabolic approximation of sin(x)
/// followed by one refinement step; cos(x) is obtained as sin(x + PI/2).
///
/// in: angles = Two angles, expressed in radians. Angles inside [-PI,PI] are
///     used as-is; any other finite angle is first wrapped into that range.
/// out: results = vector containing [sin(angles[0]),cos(angles[0]),sin(angles[1]),cos(angles[1])]
///
/// On 32-bit ARM builds with NEON enabled the four lanes are computed by the
/// inline assembly below; every other target uses an equivalent scalar loop.
static inline void vsincos(const float angles[2], float results[4]) {
    const float pi = M_PI;
    const float two_pi = 2.f * pi;

    // The parabola is only valid on [-PI,PI], and the kernel below only undoes
    // a single overshoot caused by the +PI/2 cosine offset. Wrap anything
    // outside that range first. The guard keeps in-range inputs bit-identical
    // and avoids the remainderf() call on the fast path; the negated
    // comparison also routes NaN through remainderf(), which yields NaN.
    float wrapped[2];
    for (int i = 0; i < 2; ++i) {
        float a = angles[i];
        if (!(fabsf(a) <= pi)) {
            a = remainderf(a, two_pi);
        }
        wrapped[i] = a;
    }

#if defined(__arm__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
    static const float constants[] = {
        /* q1 */ 0, M_PI_2, 0, M_PI_2,
        /* q2 */ M_PI, M_PI, M_PI, M_PI,
        /* q3 */ 4.f/M_PI, 4.f/M_PI, 4.f/M_PI, 4.f/M_PI,
        /* q4 */ -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI),
        /* q5 */ 2.f, 2.f, 2.f, 2.f,
        /* q6 */ .225f, .225f, .225f, .225f
    };
    asm volatile(
        // Load q0 with [angle1,angle1,angle2,angle2]
        "vldmia %1, { d3 }\n\t"
        "vdup.f32 d0, d3[0]\n\t"
        "vdup.f32 d1, d3[1]\n\t"
        // Load q1-q6 with constants
        "vldmia %2, { q1-q6 }\n\t"
        // Cos(x) = Sin(x+PI/2), so
        // q0 = [angle1, angle1+PI/2, angle2, angle2+PI/2]
        "vadd.f32 q0,q0,q1\n\t"
        // if a lane is now >= PI, subtract 2*PI:
        // q1 = (q0>=PI) ? PI : 0, then q0 -= q1*2
        "vcge.f32 q1,q0,q2\n\t"
        "vand.f32 q1,q1,q2\n\t"
        "vmls.f32 q0,q1,q5\n\t"
        // q0=(4/PI)*q0 - q0*abs(q0)*4/(PI*PI)
        "vabs.f32 q1,q0\n\t"
        "vmul.f32 q1,q0,q1\n\t"
        "vmul.f32 q0,q0,q3\n\t"
        "vmul.f32 q1,q1,q4\n\t"
        "vadd.f32 q0,q0,q1\n\t"
        // q0+=.225*(q0*abs(q0) - q0)
        "vabs.f32 q1,q0\n\t"
        "vmul.f32 q1,q0,q1\n\t"
        "vsub.f32 q1,q0\n\t"
        "vmla.f32 q0,q1,q6\n\t"
        // Store the four lanes to results
        "vstmia %0, { q0 }\n\t"
        // No output operands: results is written through its pointer, which
        // the "memory" clobber covers.
        :: "r"(results), "r"(wrapped), "r"(constants)
        : "memory","cc","q0","q1","q2","q3","q4","q5","q6"
    );
#else
    // Scalar equivalent of the NEON kernel, one output lane per iteration.
    const float half_pi = M_PI_2;
    const float four_over_pi = 4.f / M_PI;
    const float neg_four_over_pi_sq = -4.f / (M_PI * M_PI);
    for (int lane = 0; lane < 4; ++lane) {
        // Even lanes hold the sine, odd lanes the cosine (as sin(x + PI/2)).
        float x = wrapped[lane / 2] + ((lane & 1) ? half_pi : 0.f);
        // The cosine offset may push x past PI; bring it back by one period.
        if (x >= pi) {
            x -= pi * 2.f;
        }
        // First pass: y = (4/PI)*x - (4/PI^2)*x*|x|
        float y = x * four_over_pi + (x * fabsf(x)) * neg_four_over_pi_sq;
        // Refinement: y += .225*(y*|y| - y)
        y += (y * fabsf(y) - y) * .225f;
        results[lane] = y;
    }
#endif
}
@shaforostoff
Copy link

It doesn't seem to work as expected:

// Reference run: fill test_s/test_c with sin()/cos() of the integer angles
// 0..9 (in radians) and print each pair.
float test_c[10];
float test_s[10];
for(int i = 0; i < 10; i++)
{
	test_s[i] = sin((float)i);
	test_c[i] = cos((float)i);
	// Prints the cosine first, then the sine.
	qDebug()<<"ssss"<<test_c[i]<<test_s[i];
}

the last line is
ssss -0.91113 0.412118

// Same table as the reference run, but filled two angles per iteration
// through vsincos(). Note that the angles used here go up to 9 radians,
// i.e. most of them lie outside [-PI, PI].
float test_c[10];
float test_s[10];
for(int i = 0; i < 10; i+=2)
{
	const float angles[2] = {i, i+1};
	float results[4];
	vsincos(angles, results);
	// results layout: [sin(angles[0]), cos(angles[0]), sin(angles[1]), cos(angles[1])]
	test_s[i] = results[0];
	test_s[i+1] = results[2];
	test_c[i] = results[1];
	test_c[i+1] = results[3];
	// Prints the cosine first, then the sine, for each of the two angles.
	qDebug()<<"sc"<<test_c[i]<<test_s[i];
	qDebug()<<"sc"<<test_c[i+1]<<test_s[i+1];
}

the last line is
sc -2.43568 0.4117

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment