Last active
January 10, 2024 05:35
-
-
Save equalent/35d9575747b9cfa05230230a0d8469d4 to your computer and use it in GitHub Desktop.
Inverse square root implementation in plain C (q_rsqrt) and using SSE / NEON intrinsics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) \ | |
|| defined(__x86_64) || defined(_M_AMD64) | |
#define RSQRT_ARCH_AMD64 | |
#define RSQRT_ARCH_X86 | |
#endif | |
#if defined(i386) || defined(__i386) || defined(__i386__) \ | |
|| defined(__i386__) || defined(__IA32__) || defined(_M_IX86) | |
#define RSQRT_ARCH_I386 | |
#define RSQRT_ARCH_X86 | |
#endif | |
#if defined(__aarch64__) || defined(_M_ARM64) | |
#define RSQRT_ARCH_ARM64 | |
#endif | |
#ifdef _MSC_VER | |
#define RSQRT_FASTCALL __fastcall | |
#define RSQRT_FORCEINLINE __forceinline | |
#else | |
#define RSQRT_FASTCALL | |
#define RSQRT_FORCEINLINE __attribute__((__always_inline__)) | |
#endif | |
#if defined(RSQRT_ARCH_X86) | |
#include <xmmintrin.h> | |
#elif defined(RSQRT_ARCH_ARM64) | |
#ifdef _MSC_VER | |
#include <arm64_neon.h> | |
#else | |
#include <arm_neon.h> | |
#endif | |
#endif // defined(RSQRT_ARCH_ARM64) | |
RSQRT_FORCEINLINE float RSQRT_FASTCALL rsqrt(float number) | |
{ | |
#if defined(RSQRT_ARCH_X86) | |
__m128 n = _mm_set_ss(number); | |
n = _mm_rsqrt_ss(n); | |
return _mm_cvtss_f32(n); | |
#elif defined(RSQRT_ARCH_ARM64) | |
// initial estimate: | |
float32x2_t n = vrsqrte_f32(vdup_n_f32(number)); | |
// increase accuracy using Newton-Raphson method: | |
n = vmul_f32(n, vrsqrts_f32(n, vmul_f32(n, vdup_n_f32(number)))); | |
return vget_lane_f32(n, 0); | |
#else | |
// https://en.wikipedia.org/wiki/Fast_inverse_square_root#Worked_example | |
union { | |
float f; | |
uint32_t i; | |
} conv; | |
conv.f = number; | |
conv.i = 0x5f3759df - (conv.i >> 1); | |
conv.f *= 1.5F - (number * 0.5F * conv.f * conv.f); | |
return conv.f; | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment