Last active
January 3, 2019 14:42
-
-
Save simonlindholm/f7d8799b0781f50a7ccc8f53ff3b5478 to your computer and use it in GitHub Desktop.
Extracted from https://bugzilla.mozilla.org/show_bug.cgi?id=967709, MPL licensed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE
#include <bits/stdc++.h>
using namespace std;

/* mfbt basic SIMD wrappers. */

#include <math.h>
#include <stdint.h>

// Figure out how to get access to SIMD on the current compiler.
#if defined(_MSC_VER)
# if _M_IX86_FP >= 2
// MSVC supports <xmmintrin.h>.
#  define MOZ_XMMINTRIN_SIMD 1
# endif
#elif defined(__clang__)
// Clang supports the SIMD syntax and features even on targets which lack
// SIMD support.
# define MOZ_EXT_VECTOR_SIMD 1
# define MOZ_BUILTIN_SIMD 1
#elif defined(__GNUC__)
// GCC emits suboptimal code around xorpd though (filed as GCC bug 60826), so
// we use <xmmintrin.h> there also for now.
# if defined(__SSE2__)
#  define MOZ_XMMINTRIN_SIMD 1
# elif defined(__ARM_NEON__) || defined(__ALTIVEC__)
#  define MOZ_BUILTIN_SIMD 1
# endif
#endif

#if !defined(MOZ_XMMINTRIN_SIMD) && !defined(MOZ_BUILTIN_SIMD)
// No SIMD support. Do what we can with plain scalar code.
# define MOZ_FAKE_SIMD 1
#endif

// Figure out how to call copysign.
#ifdef __GNUC__
# define MOZ_COPYSIGN __builtin_copysign
#elif defined _WIN32
# define MOZ_COPYSIGN _copysign
#else
# define MOZ_COPYSIGN copysign
#endif

#ifdef MOZ_XMMINTRIN_SIMD
# include <xmmintrin.h>
#endif

namespace mozilla {
namespace SIMD {

// V2F64 is a vector of two doubles; V2I64 is its same-width integer view,
// used for bitwise operations on the double bit patterns.
#ifdef MOZ_BUILTIN_SIMD
// Add may_alias since we often have to cast pointers around, especially
// since we're restricted in what we can do due to portability constraints.
# ifdef MOZ_EXT_VECTOR_SIMD
typedef double V2F64 __attribute__((__ext_vector_type__(2), __may_alias__));
typedef int64_t V2I64 __attribute__((__ext_vector_type__(2), __may_alias__));
# else
typedef double V2F64 __attribute__((__vector_size__(16), __may_alias__));
typedef int64_t V2I64 __attribute__((__vector_size__(16), __may_alias__));
# endif
#endif
#ifdef MOZ_XMMINTRIN_SIMD
typedef __m128d V2F64;
typedef __m128i V2I64;
#endif
#ifdef MOZ_FAKE_SIMD
typedef union { double e[2]; int64_t i[2]; } V2F64;
typedef struct { int64_t e[2]; } V2I64;
#endif

// Lane-wise floating-point arithmetic: element k of the result is
// l[k] OP r[k].
#ifdef MOZ_BUILTIN_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) { return l + r; }
inline V2F64 FSub(V2F64 l, V2F64 r) { return l - r; }
inline V2F64 FMul(V2F64 l, V2F64 r) { return l * r; }
inline V2F64 FDiv(V2F64 l, V2F64 r) { return l / r; }
#endif
#ifdef MOZ_XMMINTRIN_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) { return _mm_add_pd(l, r); }
inline V2F64 FSub(V2F64 l, V2F64 r) { return _mm_sub_pd(l, r); }
inline V2F64 FMul(V2F64 l, V2F64 r) { return _mm_mul_pd(l, r); }
inline V2F64 FDiv(V2F64 l, V2F64 r) { return _mm_div_pd(l, r); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] + r.e[0], l.e[1] + r.e[1] };
  return result;
}
inline V2F64 FSub(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] - r.e[0], l.e[1] - r.e[1] };
  return result;
}
inline V2F64 FMul(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] * r.e[0], l.e[1] * r.e[1] };
  return result;
}
inline V2F64 FDiv(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] / r.e[0], l.e[1] / r.e[1] };
  return result;
}
#endif

// Lane-wise bitwise operations on the double bit patterns.
// FAndnot(l, r) computes ~l & r (matching SSE's andnpd operand order).
#ifdef MOZ_BUILTIN_SIMD
// We do need to define the bitwise operators though. We don't use
// operator& and friends because these are actually floating-point types, and
// it's confusing to implicitly treat them like integer types in a C++ API.
// And speaking of conflating types in a confusing way, the compiler provides
// bitwise casts through the plain cast syntax, which is inconsistent, but
// the way it is.
# if defined(__clang__) && defined(__SSE2__)
// Clang has a philosophy that the optimizer knows better than the human
// writing vector intrinsics and studying the assembly output. It isn't always
// true. Use inline asm rinses, in this case to prevent it from converting
// operations like andnpd into integer operations, defeating CSE with other
// vectors, requiring awkward movabs instructions to materialize immediates,
// and generally doing weird things.
inline V2F64 FAnd (V2F64 l, V2F64 r) {
  asm("" : "+x"(l)); // opaque no-op: keep l in an xmm register as-is
  V2F64 result = (V2F64)((V2I64)l & (V2I64)r);
  return result;
}
inline V2F64 FOr (V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)((V2I64)l | (V2I64)r);
  return result;
}
inline V2F64 FXor (V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)((V2I64)l ^ (V2I64)r);
  return result;
}
inline V2F64 FAndnot(V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)(~(V2I64)l & (V2I64)r);
  return result;
}
# else
inline V2F64 FAnd (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l & (V2I64)r); }
inline V2F64 FOr (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l | (V2I64)r); }
inline V2F64 FXor (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l ^ (V2I64)r); }
inline V2F64 FAndnot(V2F64 l, V2F64 r) { return (V2F64)(~(V2I64)l & (V2I64)r);}
# endif
#endif
#ifdef MOZ_XMMINTRIN_SIMD
inline V2F64 FAnd (V2F64 l, V2F64 r) { return _mm_and_pd (l, r); }
inline V2F64 FOr (V2F64 l, V2F64 r) { return _mm_or_pd (l, r); }
inline V2F64 FXor (V2F64 l, V2F64 r) { return _mm_xor_pd (l, r); }
inline V2F64 FAndnot(V2F64 l, V2F64 r) { return _mm_andnot_pd(l, r); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 FAnd(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] & r.i[0];
  result.i[1] = l.i[1] & r.i[1];
  return result;
}
inline V2F64 FOr(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] | r.i[0];
  result.i[1] = l.i[1] | r.i[1];
  return result;
}
inline V2F64 FXor(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] ^ r.i[0];
  result.i[1] = l.i[1] ^ r.i[1];
  return result;
}
inline V2F64 FAndnot(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = ~l.i[0] & r.i[0];
  result.i[1] = ~l.i[1] & r.i[1];
  return result;
}
#endif

// Constructors: BuildVector(e0, e1) yields { e0, e1 } (e0 in element 0),
// Splat2(e) yields { e, e }, FZero() yields { 0.0, 0.0 }.
#ifdef MOZ_BUILTIN_SIMD
# if defined(__clang__) && defined(__SSE2__)
// As above, clang thinks it is smart but it does weird things, this time
// with a pslldq. Fix it.
inline V2F64 BuildVector(double e0, double e1) {
  V2F64 result = (V2F64){ e0, e1 };
  asm("" : "+x"(result));
  return result;
}
# else
inline V2F64 BuildVector(double e0, double e1) { return (V2F64){ e0, e1 }; }
# endif
inline V2F64 Splat2(double e) { return (V2F64){ e, e }; }
inline V2F64 FZero() { return (V2F64){ 0.0, 0.0 }; }
#endif
#ifdef MOZ_XMMINTRIN_SIMD
// This is not a typo. e1 and e0 really do go in this order:
// _mm_set_pd takes its arguments from high element to low element.
inline V2F64 BuildVector(double e0, double e1) { return _mm_set_pd(e1, e0); }
inline V2F64 Splat2(double e) { return _mm_set1_pd(e); }
inline V2F64 FZero() { return _mm_setzero_pd(); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 BuildVector(double e0, double e1) {
  V2F64 result = { e0, e1 };
  return result;
}
inline V2F64 Splat2(double e) {
  V2F64 result = { e, e };
  return result;
}
inline V2F64 FZero() {
  V2F64 result = { 0.0, 0.0 };
  return result;
}
#endif

// Implicitly promote a scalar to a vector, leaving the high elements undefined.
#if defined(MOZ_EXT_VECTOR_SIMD)
// With ext_vector_type, we can use OpenCL-style swizzling.
inline V2F64 ScalarToVector2(double e) {
  V2F64 result;
  result.x = e;
  return result;
}
#elif defined(__GNUC__) && defined(__SSE2__) && !defined(MOZ_FAKE_SIMD)
// GCC appears to provide no nicer way to do this, but we can get exactly what
// we want with an inline asm.
inline V2F64 ScalarToVector2(double e) {
  V2F64 result;
  asm("" : "=x"(result) : "0"(e));
  return result;
}
#else
// This suffices, but it's often suboptimal as some compilers seem to want to
// set f to 0.
// NOTE(review): f is deliberately left uninitialized — only element 0 of the
// result is specified by this function's contract.
inline V2F64 ScalarToVector2(double e) {
  double f;
  return BuildVector(e, f);
}
#endif

// Demote a vector to a scalar with the value of its first element.
#if defined(MOZ_FAKE_SIMD)
inline double VectorToScalar(V2F64 v) { return v.e[0]; }
inline int64_t VectorToScalar(V2I64 v) { return v.e[0]; }
#elif defined(__clang__) || \
      (defined(__GNUC__) && \
       (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
inline double VectorToScalar(V2F64 v) { return v[0]; }
inline int64_t VectorToScalar(V2I64 v) { return v[0]; }
#else
// G++ pre 4.8 doesn't support subscripting on vector types.
// FIXME: Hopefully there's a better way to do this on MSVC.
inline double VectorToScalar(V2F64 v) { return *(double*)&v; }
inline int64_t VectorToScalar(V2I64 v) { return *(int64_t*)&v; }
#endif

// Sign-bit helpers built from the primitives above.
// FNegzero(): both lanes -0.0 (only the sign bit set).
inline V2F64 FNegzero() { return BuildVector(-0.0, -0.0); }
// FSign(v): isolate each lane's sign bit.
inline V2F64 FSign(V2F64 v) { return FAnd(FNegzero(), v); }
// FAbs(v): clear each lane's sign bit.
inline V2F64 FAbs(V2F64 v) {
#ifndef MOZ_FAKE_SIMD
  return FAndnot(FNegzero(), v);
#else
  // FAnd is slow on fake SIMD.
  V2F64 result = { fabs(v.e[0]), fabs(v.e[1]) };
  return result;
#endif
}
// FNeg(v): flip each lane's sign bit.
inline V2F64 FNeg(V2F64 v) {
#ifndef MOZ_FAKE_SIMD
  return FXor(FNegzero(), v);
#else
  // FXor is slow on fake SIMD.
  V2F64 result = { -v.e[0], -v.e[1] };
  return result;
#endif
}
// FCopysign(v, s): magnitude of v with the sign of s, per lane.
inline V2F64 FCopysign(V2F64 v, V2F64 s) {
#ifndef MOZ_FAKE_SIMD
  V2F64 negzero = FNegzero();
  return FOr(FAnd(negzero, s),
             FAndnot(negzero, v));
#else
  // FOr/FAnd are slow on fake SIMD.
  V2F64 result = { MOZ_COPYSIGN(v.e[0], s.e[0]), MOZ_COPYSIGN(v.e[1], s.e[1]) };
  return result;
#endif
}
// FSignbit(v): 1 if element 0's sign bit is set, else 0.
#ifndef MOZ_FAKE_SIMD
inline unsigned FSignbit(V2F64 v) { return ((V2I64)v)[0] < 0; }
#else
inline unsigned FSignbit(V2F64 v) { return v.i[0] < 0; }
#endif

} // namespace SIMD
} // namespace mozilla
// Constants generated using Mathematica's GeneralMiniMaxApproximation. The
// Even elements are for the sin polynomial approximation, the odd for the
// cos polynomial approximation.
//
// The cos set uses one less coefficient than usual implementations to
// increase performance, raising the maximum approximation error to 2 bits.
//
// aligned(16) so the table can be read as 16-byte V2F64 vectors.
extern const double coefficients[] __attribute__((aligned(16)));
const double coefficients[] = {
  1.59046813973877163292e-10, 2.06467337476762997948e-9,
  -2.50509001624159785668e-08, -2.75555495413759160741e-7,
  2.75573146431678644161e-06, 2.48015808595638122085e-5,
  -1.98412698327005105692e-04, -1.38888888779622760722e-3,
  8.33333333332626768897e-03, 4.16666666665987187046e-2,
  -1.66666666666666490881e-01, -4.99999999999999888978e-1
};

// Branch-prediction hint: the wrapped condition is expected to be false.
#define MOZ_UNLIKELY(x) __builtin_expect((x), 0)

// Result pair for fast_sincos: s = sin(x), c = cos(x).
struct sincos_result { double s, c; };
// Compute sin(x) and cos(x) together using a vectorized minimax polynomial:
// the sin approximation is evaluated in vector element 0 and the cos
// approximation in element 1, sharing the Horner-scheme multiplies.
// Falls back to libm sin/cos when |x| is outside the supported range
// (which also covers Infinity and NaN inputs).
static sincos_result fast_sincos(double x)
{
  using namespace mozilla::SIMD;

  // Make argument non-negative but save the sign.
  V2F64 vx = ScalarToVector2(x);
  double absx = VectorToScalar(FAbs(vx));

  // The optimized algorithm below doesn't currently support values of x beyond
  // pow(2, 32) - 2. If x is beyond the range we support, fall back to the libm
  // implementation. This check also handles the Infinity and NaN input cases.
  // abs(x) < (221069929647945 / pow(2,16))
  if (MOZ_UNLIKELY(!(absx < 3.37325942455970764160e9))) {
    double sx = VectorToScalar(vx);
    sincos_result result = {
      sin(sx),
      cos(sx)
    };
    return result;
  }

  // Octant index of |x|: i = floor(|x| * 4/pi) + 1, rounded down to even
  // below so each quadrant spans two octants.
  static const double m_4_pi = 1.27323954473516276487; // 4.0 / M_PI
  uint32_t i = static_cast<uint32_t>(absx * m_4_pi) + 1;

  // Integer and fractional part modulo one octant.
  double y = static_cast<double>(i & -2);

  // Extended precision modular arithmetic: subtract y * pi/4 in three
  // progressively smaller exact pieces so the reduced argument stays accurate.
  double e0 = y * -7.85398006439208984375e-1;  // 1647099 / pow(2,21)
  double e1 = y * -1.56958208208379801363e-7;  // 1380619 / pow(2,43)
  double e2 = y * -3.11168608594830669189e-14; // 4930663418217751 / pow(2,97)
  double z = absx + e0 + e1 + e2;

  // Compute quadrant 0 sin and cos polynomial approximations. We compute sin
  // in vector element 0 and cos in vector element 1, via Horner's scheme in
  // zz = z*z over the interleaved coefficient table.
  double zz = z * z;
  const V2F64 *c = (const V2F64 *)coefficients; // table is 16-byte aligned
  V2F64 v2zz = Splat2(zz);
  V2F64 ans = c[0];
  ans = FMul(ans, v2zz);
  ans = FAdd(ans, c[1]);
  ans = FMul(ans, v2zz);
  ans = FAdd(ans, c[2]);
  ans = FMul(ans, v2zz);
  ans = FAdd(ans, c[3]);
  ans = FMul(ans, v2zz);
  ans = FAdd(ans, c[4]);
  ans = FMul(ans, v2zz);
  ans = FAdd(ans, c[5]);
  // Final step differs per lane: sin gets an extra factor of z (odd series),
  // cos an extra zz (even series), then the leading terms z and 1.0.
  ans = FMul(ans, BuildVector(zz * z, zz));
  ans = FAdd(ans, BuildVector(z, 1.0));

  // Create an array which consists of the q0 sin, cos, -sin, -cos.
  const V2F64 reflect_vecs[2] = { ans, FNeg(ans) };
  const double *reflect = (const double*)reflect_vecs;

  // Load the result with the desired reflection: the quadrant rotates
  // through sin/cos/-sin/-cos, and a negative input shifts sin by two
  // (sin is odd; cos, read one slot ahead, is unaffected since it's even).
  uint32_t quad_index = i >> 1;
  uint32_t orig_sign = FSignbit(vx);
  double sin_result = reflect[(quad_index + (orig_sign << 1)) & 3];
  double cos_result = reflect[(quad_index + 1) & 3];
  sincos_result result = { sin_result, cos_result };
  return result;
}
int main() { | |
double sum = 0; | |
double v = 0.3; | |
for (int i = 0; i < 100000000; ++i) { | |
v += 0.2; | |
v /= 2; | |
if (0) { | |
double s, c; | |
sincos(0.2, &s, &c); | |
sum += s; | |
sum += c; | |
} else if (1) { | |
sincos_result r = fast_sincos(0.2); | |
sum += r.s; | |
sum += r.c; | |
} else { | |
sum += sin(v); | |
sum += cos(v); | |
} | |
} | |
cout << sum << endl; | |
// cout << r.s << ' ' << r.c << endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment