Last active
September 10, 2018 06:58
-
-
Save native-m/154a85a6b3fac1d23d6623241a9fdb1e to your computer and use it in GitHub Desktop.
Fast math functions approximation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // License under: WTFPL 1.0 | |
| // WARNING: I haven't checked the approximation error yet. | |
| #ifndef __FASTMATH_H__ | |
| #define __FASTMATH_H__ | |
| #define _USE_MATH_DEFINES | |
| #include <math.h> | |
| inline double fast_fmod(double x, double y) | |
| { | |
| __m128d xA = _mm_load_sd(&x); // Load x | |
| __m128d yA = _mm_load_sd(&y); // Load y | |
| __m128d ret = _mm_div_sd(xA, yA); // x / y | |
| // (ret - floor(ret)) * y | |
| ret = _mm_mul_sd(_mm_sub_sd(ret, _mm_cvtsi32_sd(ret, _mm_cvttsd_si32(ret))), *(__m128d*)&y); | |
| return *(double*)&ret; | |
| } | |
| inline float fast_fmod(float x, float y) | |
| { | |
| __m128 xA = _mm_load_ss(&x); // Load x | |
| __m128 yA = _mm_load_ss(&y); // Load y | |
| __m128 ret = _mm_div_ss(xA, yA); // x / y | |
| // (ret - floor(ret)) * y | |
| ret = _mm_mul_ss(_mm_sub_ss(ret, _mm_cvtsi32_ss(ret, _mm_cvttss_si32(ret))), *(__m128*)&y); | |
| return *(float*)&ret; | |
| } | |
| inline double fast_sin(double x) | |
| { | |
| double xSq; | |
| double result, tmp; | |
| const double twopi = 2.0 * M_PI; | |
| tmp = fast_fmod(x, twopi) - M_PI; | |
| xSq = tmp * tmp; | |
| result = -1.605904383682161334e-10; | |
| result *= xSq; | |
| result += 2.505210838544172022e-08; | |
| result *= xSq; | |
| result -= 2.755731922398589251e-06; | |
| result *= xSq; | |
| result += 0.0001984126984126984125; | |
| result *= xSq; | |
| result -= 0.008333333333333333218; | |
| result *= xSq; | |
| result += 0.1666666666666666574; | |
| result *= xSq; | |
| result -= 1.0; | |
| return result * tmp; | |
| } | |
| inline float fast_sin(float x) | |
| { | |
| float xSq; | |
| float result, tmp; | |
| const float twopi = 2.0 * (float)M_PI; | |
| tmp = fast_fmod(x, twopi) - (float)M_PI; | |
| xSq = tmp * tmp; | |
| result = -1.605904437e-10f; | |
| result *= xSq; | |
| result += 2.505210794e-08f; | |
| result *= xSq; | |
| result -= 2.755731884e-06f; | |
| result *= xSq; | |
| result += 0.0001984127011f; | |
| result *= xSq; | |
| result -= 0.00833333333786f; | |
| result *= xSq; | |
| result += 0.166666666716f; | |
| result *= xSq; | |
| result -= 1.0f; | |
| return result * tmp; | |
| } | |
| inline double fast_cos(double x) | |
| { | |
| double xSq; | |
| double result; | |
| const double twopi = 2.0 * M_PI; | |
| result = fast_fmod(x, twopi) - M_PI; | |
| xSq = result * result; | |
| result = -2.087675698786810019e-09; | |
| result *= xSq; | |
| result += 2.755731922398588828e-07; | |
| result *= xSq; | |
| result -= 2.480158730158730157e-05; | |
| result *= xSq; | |
| result += 0.001388888888888888942; | |
| result *= xSq; | |
| result -= 0.04166666666666666435; | |
| result *= xSq; | |
| result += 0.5; | |
| result *= xSq; | |
| result -= 1.0; | |
| return result; | |
| } | |
| inline float fast_cos(float x) | |
| { | |
| float xSq; | |
| float result; | |
| const float twopi = 2.0 * (float)M_PI; | |
| result = fast_fmod(x, twopi) - (float)M_PI; | |
| xSq = result * result; | |
| result = -2.087675588e-09f; | |
| result *= xSq; | |
| result += 2.755731998e-07f; | |
| result *= xSq; | |
| result -= 2.480158764e-05f; | |
| result *= xSq; | |
| result += 0.001388888923f; | |
| result *= xSq; | |
| result -= 0.04166666791f; | |
| result *= xSq; | |
| result += 0.5f; | |
| result *= xSq; | |
| result -= 1.0f; | |
| return result; | |
| } | |
| inline double fast_sqrt(double x) | |
| { | |
| __m128d d = _mm_load_sd(&x); | |
| d = _mm_sqrt_sd(d, d); | |
| return *(double*)&d; | |
| } | |
| inline float fast_sqrt(float x) | |
| { | |
| __m128 d = _mm_load_ss(&x); | |
| d = _mm_sqrt_ss(d); | |
| return *(float*)&d; | |
| } | |
| inline double fast_rsqrt(double x) | |
| { | |
| // Since we dont have a native SSE instruction for inverse sqrt. | |
| // We have to replace it with 1 / sqrt(x) instead. Maybe i will replace it with NR method later. | |
| return 1.0 / fast_sqrt(x); | |
| } | |
| inline float fast_rsqrt(float x) | |
| { | |
| long i; | |
| float x2, y; | |
| const float threehalfs = 1.5f; | |
| __m128 r; | |
| x2 = x * 0.5; // x / 2 | |
| r = _mm_load_ss(&x); // Load x | |
| r = _mm_rsqrt_ss(r); // Compute early inverse sqrt with SSE instruction. Btw, this boi is less accurate than the original 1 / sqrt(x). | |
| y = *(float*)&r; // Store early inverse sqrt to y | |
| y = y * (threehalfs - (x2 * y * y)); // Do NR step (1st iteration) | |
| //y = y * (threehalfs - (x2 * y * y)); // Do NR step (2nd iteration, this can be removed. More steps, more accurate) | |
| return y; | |
| } | |
| #endif // __FASTMATH_H__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment