native-m · September 10, 2018 06:58
diff --git a/FastMath.h b/FastMath.h
 // License under: WTFPL 1.0
 // WARNING: I haven't checked the approximation error yet.
 #ifndef __FASTMATH_H__
 #define __FASTMATH_H__

 #define _USE_MATH_DEFINES
 #include <math.h>

 inline double fast_fmod(double x, double y)
 {
 	__m128d xA = _mm_load_sd(&x);		// Load x
 	__m128d yA = _mm_load_sd(&y);		// Load y
 	__m128d ret = _mm_div_sd(xA, yA);	// x / y

 	// (ret - floor(ret)) * y
 	ret = _mm_mul_sd(_mm_sub_sd(ret, _mm_cvtsi32_sd(ret, _mm_cvttsd_si32(ret))), *(__m128d*)&y);

 	return *(double*)&ret;
 }

 inline float fast_fmod(float x, float y)
 {
 	__m128 xA = _mm_load_ss(&x);		// Load x
 	__m128 yA = _mm_load_ss(&y);		// Load y
 	__m128 ret = _mm_div_ss(xA, yA);	// x / y

 	// (ret - floor(ret)) * y
 	ret = _mm_mul_ss(_mm_sub_ss(ret, _mm_cvtsi32_ss(ret, _mm_cvttss_si32(ret))), *(__m128*)&y);

 	return *(float*)&ret;
 }

 inline double fast_sin(double x)
 {
 	double xSq;
 	double result, tmp;
 	const double twopi = 2.0 * M_PI;

 	tmp = fast_fmod(x, twopi) - M_PI;

 	xSq = tmp * tmp;
 	result = -1.605904383682161334e-10;
 	result *= xSq;
 	result += 2.505210838544172022e-08;
 	result *= xSq;
 	result -= 2.755731922398589251e-06;
 	result *= xSq;
 	result += 0.0001984126984126984125;
 	result *= xSq;
 	result -= 0.008333333333333333218;
 	result *= xSq;
 	result += 0.1666666666666666574;
 	result *= xSq;
 	result -= 1.0;

 	return result * tmp;
 }

 inline float fast_sin(float x)
 {
 	float xSq;
 	float result, tmp;
 	const float twopi = 2.0 * (float)M_PI;

 	tmp = fast_fmod(x, twopi) - (float)M_PI;

 	xSq = tmp * tmp;
 	result = -1.605904437e-10f;
 	result *= xSq;
 	result += 2.505210794e-08f;
 	result *= xSq;
 	result -= 2.755731884e-06f;
 	result *= xSq;
 	result += 0.0001984127011f;
 	result *= xSq;
 	result -= 0.00833333333786f;
 	result *= xSq;
 	result += 0.166666666716f;
 	result *= xSq;
 	result -= 1.0f;

 	return result * tmp;
 }

 inline double fast_cos(double x)
 {
 	double xSq;
 	double result;
 	const double twopi = 2.0 * M_PI;

 	result = fast_fmod(x, twopi) - M_PI;

 	xSq = result * result;
 	result = -2.087675698786810019e-09;
 	result *= xSq;
 	result += 2.755731922398588828e-07;
 	result *= xSq;
 	result -= 2.480158730158730157e-05;
 	result *= xSq;
 	result += 0.001388888888888888942;
 	result *= xSq;
 	result -= 0.04166666666666666435;
 	result *= xSq;
 	result += 0.5;
 	result *= xSq;
 	result -= 1.0;

 	return result;
 }

 inline float fast_cos(float x)
 {
 	float xSq;
 	float result;
 	const float twopi = 2.0 * (float)M_PI;

 	result = fast_fmod(x, twopi) - (float)M_PI;

 	xSq = result * result;
 	result = -2.087675588e-09f;
 	result *= xSq;
 	result += 2.755731998e-07f;
 	result *= xSq;
 	result -= 2.480158764e-05f;
 	result *= xSq;
 	result += 0.001388888923f;
 	result *= xSq;
 	result -= 0.04166666791f;
 	result *= xSq;
 	result += 0.5f;
 	result *= xSq;
 	result -= 1.0f;

 	return result;
 }

 inline double fast_sqrt(double x)
 {
 	__m128d d = _mm_load_sd(&x);
 	d = _mm_sqrt_sd(d, d);
 	return *(double*)&d;
 }

 inline float fast_sqrt(float x)
 {
 	__m128 d = _mm_load_ss(&x);
 	d = _mm_sqrt_ss(d);
 	return *(float*)&d;
 }

 inline double fast_rsqrt(double x)
 {
 	// Since we dont have a native SSE instruction for inverse sqrt.
 	// We have to replace it with 1 / sqrt(x) instead. Maybe i will replace it with NR method later.
 	return 1.0 / fast_sqrt(x);
 }

 inline float fast_rsqrt(float x)
 {
 	long i;
 	float x2, y;
 	const float threehalfs = 1.5f;
 	__m128 r;

 	x2 = x * 0.5;							// x / 2
 	r = _mm_load_ss(&x);					// Load x
 	r = _mm_rsqrt_ss(r);					// Compute early inverse sqrt with SSE instruction. Btw, this boi is less accurate than the original 1 / sqrt(x).
 	y = *(float*)&r;						// Store early inverse sqrt to y
 	y = y * (threehalfs - (x2 * y * y));	// Do NR step (1st iteration)
 	//y = y * (threehalfs - (x2 * y * y));	// Do NR step (2nd iteration, this can be removed. More steps, more accurate)

 	return y;
 }

 #endif // __FASTMATH_H__
	// License under: WTFPL 1.0
	// WARNING: I haven't checked the approximation error yet.
	#ifndef __FASTMATH_H__
	#define __FASTMATH_H__

	#define _USE_MATH_DEFINES
	#include <math.h>

	inline double fast_fmod(double x, double y)
	{
	__m128d xA = _mm_load_sd(&x); // Load x
	__m128d yA = _mm_load_sd(&y); // Load y
	__m128d ret = _mm_div_sd(xA, yA); // x / y

	// (ret - floor(ret)) * y
	ret = _mm_mul_sd(_mm_sub_sd(ret, _mm_cvtsi32_sd(ret, _mm_cvttsd_si32(ret))), (__m128d)&y);

	return (double)&ret;
	}

	inline float fast_fmod(float x, float y)
	{
	__m128 xA = _mm_load_ss(&x); // Load x
	__m128 yA = _mm_load_ss(&y); // Load y
	__m128 ret = _mm_div_ss(xA, yA); // x / y

	// (ret - floor(ret)) * y
	ret = _mm_mul_ss(_mm_sub_ss(ret, _mm_cvtsi32_ss(ret, _mm_cvttss_si32(ret))), (__m128)&y);

	return (float)&ret;
	}

	inline double fast_sin(double x)
	{
	double xSq;
	double result, tmp;
	const double twopi = 2.0 * M_PI;

	tmp = fast_fmod(x, twopi) - M_PI;

	xSq = tmp * tmp;
	result = -1.605904383682161334e-10;
	result *= xSq;
	result += 2.505210838544172022e-08;
	result *= xSq;
	result -= 2.755731922398589251e-06;
	result *= xSq;
	result += 0.0001984126984126984125;
	result *= xSq;
	result -= 0.008333333333333333218;
	result *= xSq;
	result += 0.1666666666666666574;
	result *= xSq;
	result -= 1.0;

	return result * tmp;
	}

	inline float fast_sin(float x)
	{
	float xSq;
	float result, tmp;
	const float twopi = 2.0 * (float)M_PI;

	tmp = fast_fmod(x, twopi) - (float)M_PI;

	xSq = tmp * tmp;
	result = -1.605904437e-10f;
	result *= xSq;
	result += 2.505210794e-08f;
	result *= xSq;
	result -= 2.755731884e-06f;
	result *= xSq;
	result += 0.0001984127011f;
	result *= xSq;
	result -= 0.00833333333786f;
	result *= xSq;
	result += 0.166666666716f;
	result *= xSq;
	result -= 1.0f;

	return result * tmp;
	}

	inline double fast_cos(double x)
	{
	double xSq;
	double result;
	const double twopi = 2.0 * M_PI;

	result = fast_fmod(x, twopi) - M_PI;

	xSq = result * result;
	result = -2.087675698786810019e-09;
	result *= xSq;
	result += 2.755731922398588828e-07;
	result *= xSq;
	result -= 2.480158730158730157e-05;
	result *= xSq;
	result += 0.001388888888888888942;
	result *= xSq;
	result -= 0.04166666666666666435;
	result *= xSq;
	result += 0.5;
	result *= xSq;
	result -= 1.0;

	return result;
	}

	inline float fast_cos(float x)
	{
	float xSq;
	float result;
	const float twopi = 2.0 * (float)M_PI;

	result = fast_fmod(x, twopi) - (float)M_PI;

	xSq = result * result;
	result = -2.087675588e-09f;
	result *= xSq;
	result += 2.755731998e-07f;
	result *= xSq;
	result -= 2.480158764e-05f;
	result *= xSq;
	result += 0.001388888923f;
	result *= xSq;
	result -= 0.04166666791f;
	result *= xSq;
	result += 0.5f;
	result *= xSq;
	result -= 1.0f;

	return result;
	}

	inline double fast_sqrt(double x)
	{
	__m128d d = _mm_load_sd(&x);
	d = _mm_sqrt_sd(d, d);
	return (double)&d;
	}

	inline float fast_sqrt(float x)
	{
	__m128 d = _mm_load_ss(&x);
	d = _mm_sqrt_ss(d);
	return (float)&d;
	}

	inline double fast_rsqrt(double x)
	{
	// Since we dont have a native SSE instruction for inverse sqrt.
	// We have to replace it with 1 / sqrt(x) instead. Maybe i will replace it with NR method later.
	return 1.0 / fast_sqrt(x);
	}

	inline float fast_rsqrt(float x)
	{
	long i;
	float x2, y;
	const float threehalfs = 1.5f;
	__m128 r;

	x2 = x * 0.5; // x / 2
	r = _mm_load_ss(&x); // Load x
	r = _mm_rsqrt_ss(r); // Compute early inverse sqrt with SSE instruction. Btw, this boi is less accurate than the original 1 / sqrt(x).
	y = (float)&r; // Store early inverse sqrt to y
	y = y * (threehalfs - (x2 * y * y)); // Do NR step (1st iteration)
	//y = y * (threehalfs - (x2 * y * y)); // Do NR step (2nd iteration, this can be removed. More steps, more accurate)

	return y;
	}

	#endif // __FASTMATH_H__
No results found