Skip to content

Instantly share code, notes, and snippets.

@simonlindholm
Last active January 3, 2019 14:42
Show Gist options
  • Save simonlindholm/f7d8799b0781f50a7ccc8f53ff3b5478 to your computer and use it in GitHub Desktop.
Save simonlindholm/f7d8799b0781f50a7ccc8f53ff3b5478 to your computer and use it in GitHub Desktop.
#define _GNU_SOURCE
#include <bits/stdc++.h>
using namespace std;
/* mfbt basic SIMD wrappers. */
#include <math.h>
#include <stdint.h>
// Figure out how to get access to SIMD on the current compiler.
#if defined(_MSC_VER)
# if _M_IX86_FP >= 2
// MSVC supports <xmmintrin.h>.
# define MOZ_XMMINTRIN_SIMD 1
# endif
#elif defined(__clang__)
// Clang supports the SIMD syntax and features even on targets which lack
// SIMD support.
# define MOZ_EXT_VECTOR_SIMD 1
# define MOZ_BUILTIN_SIMD 1
#elif defined(__GNUC__)
// GCC emits suboptimal code around xorpd though (filed as GCC bug 60826), so
// we use <xmmintrin.h> there also for now.
# if defined(__SSE2__)
# define MOZ_XMMINTRIN_SIMD 1
# elif defined(__ARM_NEON__) || defined(__ALTIVEC__)
# define MOZ_BUILTIN_SIMD 1
# endif
#endif
#if !defined(MOZ_XMMINTRIN_SIMD) && !defined(MOZ_BUILTIN_SIMD)
// No SIMD support. Do what we can.
# define MOZ_FAKE_SIMD 1
#endif
// Figure out how to call copysign.
#ifdef __GNUC__
# define MOZ_COPYSIGN __builtin_copysign
#elif defined _WIN32
# define MOZ_COPYSIGN _copysign
#else
# define MOZ_COPYSIGN copysign
#endif
#ifdef MOZ_XMMINTRIN_SIMD
# include <xmmintrin.h>
#endif
namespace mozilla {
namespace SIMD {
#ifdef MOZ_BUILTIN_SIMD
// Add may_alias since we often have to cast pointers around, especially
// since we're restricted in what we can do due to portability constraints.
# ifdef MOZ_EXT_VECTOR_SIMD
typedef double V2F64 __attribute__((__ext_vector_type__(2), __may_alias__));
typedef int64_t V2I64 __attribute__((__ext_vector_type__(2), __may_alias__));
# else
typedef double V2F64 __attribute__((__vector_size__(16), __may_alias__));
typedef int64_t V2I64 __attribute__((__vector_size__(16), __may_alias__));
# endif
#endif
#ifdef MOZ_XMMINTRIN_SIMD
typedef __m128d V2F64;
typedef __m128i V2I64;
#endif
#ifdef MOZ_FAKE_SIMD
typedef union { double e[2]; int64_t i[2]; } V2F64;
typedef struct { int64_t e[2]; } V2I64;
#endif
#ifdef MOZ_BUILTIN_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) { return l + r; }
inline V2F64 FSub(V2F64 l, V2F64 r) { return l - r; }
inline V2F64 FMul(V2F64 l, V2F64 r) { return l * r; }
inline V2F64 FDiv(V2F64 l, V2F64 r) { return l / r; }
#endif
#ifdef MOZ_XMMINTRIN_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) { return _mm_add_pd(l, r); }
inline V2F64 FSub(V2F64 l, V2F64 r) { return _mm_sub_pd(l, r); }
inline V2F64 FMul(V2F64 l, V2F64 r) { return _mm_mul_pd(l, r); }
inline V2F64 FDiv(V2F64 l, V2F64 r) { return _mm_div_pd(l, r); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 FAdd(V2F64 l, V2F64 r) {
V2F64 result = { l.e[0] + r.e[0], l.e[1] + r.e[1] };
return result;
}
inline V2F64 FSub(V2F64 l, V2F64 r) {
V2F64 result = { l.e[0] - r.e[0], l.e[1] - r.e[1] };
return result;
}
inline V2F64 FMul(V2F64 l, V2F64 r) {
V2F64 result = { l.e[0] * r.e[0], l.e[1] * r.e[1] };
return result;
}
inline V2F64 FDiv(V2F64 l, V2F64 r) {
V2F64 result = { l.e[0] / r.e[0], l.e[1] / r.e[1] };
return result;
}
#endif
#ifdef MOZ_BUILTIN_SIMD
// We do need to define the bitwise operators though. We don't use
// operator& and friends because these are actually floating-point types, and
// it's confusing to implicitly treat them like integer types in a C++ API.
// And speaking of conflating types in a confusing way, the compiler provides
// bitwise casts through the plain cast syntax, which is inconsistent, but
// the way it is.
# if defined(__clang__ ) && defined(__SSE2__)
// Clang has a philosophy that the optimizer knows better than the human
// writing vector intrinsics and studying the assembly output. It isn't always
// true. Use inline asm rinses, in this case to prevent it from converting
// operations like andnpd into integer operations, defeating CSE with other
// vectors, requiring awkward movabs instructions to materialize immediates,
// and generally doing weird things.
inline V2F64 FAnd (V2F64 l, V2F64 r) {
asm("" : "+x"(l));
V2F64 result = (V2F64)((V2I64)l & (V2I64)r);
return result;
}
inline V2F64 FOr (V2F64 l, V2F64 r) {
asm("" : "+x"(l));
V2F64 result = (V2F64)((V2I64)l | (V2I64)r);
return result;
}
inline V2F64 FXor (V2F64 l, V2F64 r) {
asm("" : "+x"(l));
V2F64 result = (V2F64)((V2I64)l ^ (V2I64)r);
return result;
}
inline V2F64 FAndnot(V2F64 l, V2F64 r) {
asm("" : "+x"(l));
V2F64 result = (V2F64)(~(V2I64)l & (V2I64)r);
return result;
}
# else
inline V2F64 FAnd (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l & (V2I64)r); }
inline V2F64 FOr (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l | (V2I64)r); }
inline V2F64 FXor (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l ^ (V2I64)r); }
inline V2F64 FAndnot(V2F64 l, V2F64 r) { return (V2F64)(~(V2I64)l & (V2I64)r);}
# endif
#endif
#ifdef MOZ_XMMINTRIN_SIMD
inline V2F64 FAnd (V2F64 l, V2F64 r) { return _mm_and_pd (l, r); }
inline V2F64 FOr (V2F64 l, V2F64 r) { return _mm_or_pd (l, r); }
inline V2F64 FXor (V2F64 l, V2F64 r) { return _mm_xor_pd (l, r); }
inline V2F64 FAndnot(V2F64 l, V2F64 r) { return _mm_andnot_pd(l, r); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 FAnd(V2F64 l, V2F64 r) {
V2F64 result;
result.i[0] = l.i[0] & r.i[0];
result.i[1] = l.i[1] & r.i[1];
return result;
}
inline V2F64 FOr(V2F64 l, V2F64 r) {
V2F64 result;
result.i[0] = l.i[0] | r.i[0];
result.i[1] = l.i[1] | r.i[1];
return result;
}
inline V2F64 FXor(V2F64 l, V2F64 r) {
V2F64 result;
result.i[0] = l.i[0] ^ r.i[0];
result.i[1] = l.i[1] ^ r.i[1];
return result;
}
inline V2F64 FAndnot(V2F64 l, V2F64 r) {
V2F64 result;
result.i[0] = ~l.i[0] & r.i[0];
result.i[1] = ~l.i[1] & r.i[1];
return result;
}
#endif
#ifdef MOZ_BUILTIN_SIMD
# if defined(__clang__ ) && defined(__SSE2__)
// As above, clang thinks it is smart but it does weird things, this time
// with a pslldq. Fix it.
inline V2F64 BuildVector(double e0, double e1) {
V2F64 result = (V2F64){ e0, e1 };
asm("" : "+x"(result));
return result;
}
# else
inline V2F64 BuildVector(double e0, double e1) { return (V2F64){ e0, e1 }; }
# endif
inline V2F64 Splat2(double e) { return (V2F64){ e, e }; }
inline V2F64 FZero() { return (V2F64){ 0.0, 0.0 }; }
#endif
#ifdef MOZ_XMMINTRIN_SIMD
// This is not a typo. e1 and e0 really do go in this order.
inline V2F64 BuildVector(double e0, double e1) { return _mm_set_pd(e1, e0); }
inline V2F64 Splat2(double e) { return _mm_set1_pd(e); }
inline V2F64 FZero() { return _mm_setzero_pd(); }
#endif
#ifdef MOZ_FAKE_SIMD
inline V2F64 BuildVector(double e0, double e1) {
V2F64 result = { e0, e1 };
return result;
}
inline V2F64 Splat2(double e) {
V2F64 result = { e, e };
return result;
}
inline V2F64 FZero() {
V2F64 result = { 0.0, 0.0 };
return result;
}
#endif
// Implicitly promote a scalar to a vector, leaving the high elements undefined.
#if defined(MOZ_EXT_VECTOR_SIMD)
// With ext_vector_type, we can use OpenCL-style swizzling.
inline V2F64 ScalarToVector2(double e) {
V2F64 result;
result.x = e;
return result;
}
#elif defined(__GNUC__) && defined(__SSE2__) && !defined(MOZ_FAKE_SIMD)
// GCC appears to provide no nicer way to do this, but we can get exactly what
// we want with an inline asm.
inline V2F64 ScalarToVector2(double e) {
V2F64 result;
asm("" : "=x"(result) : "0"(e));
return result;
}
#else
// This suffices, but it's often suboptimal as some compilers seem to want to
// set f to 0.
inline V2F64 ScalarToVector2(double e) {
double f;
return BuildVector(e, f);
}
#endif
// Demote a vector to a scalar with the value of its first element.
#if defined(MOZ_FAKE_SIMD)
inline double VectorToScalar(V2F64 v) { return v.e[0]; }
inline int64_t VectorToScalar(V2I64 v) { return v.e[0]; }
#elif defined(__clang__) || \
(defined(__GNUC__) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
inline double VectorToScalar(V2F64 v) { return v[0]; }
inline int64_t VectorToScalar(V2I64 v) { return v[0]; }
#else
// G++ pre 4.8 doesn't support subscripting on vector types.
// FIXME: Hopefully there's a better way to do this on MSVC.
inline double VectorToScalar(V2F64 v) { return *(double*)&v; }
inline int64_t VectorToScalar(V2I64 v) { return *(int64_t*)&v; }
#endif
inline V2F64 FNegzero() { return BuildVector(-0.0, -0.0); }
inline V2F64 FSign(V2F64 v) { return FAnd(FNegzero(), v); }
inline V2F64 FAbs(V2F64 v) {
#ifndef MOZ_FAKE_SIMD
return FAndnot(FNegzero(), v);
#else
// FAnd is slow on fake SIMD.
V2F64 result = { fabs(v.e[0]), fabs(v.e[1]) };
return result;
#endif
}
inline V2F64 FNeg(V2F64 v) {
#ifndef MOZ_FAKE_SIMD
return FXor(FNegzero(), v);
#else
// FXor is slow on fake SIMD.
V2F64 result = { -v.e[0], -v.e[1] };
return result;
#endif
}
inline V2F64 FCopysign(V2F64 v, V2F64 s) {
#ifndef MOZ_FAKE_SIMD
V2F64 negzero = FNegzero();
return FOr(FAnd(negzero, s),
FAndnot(negzero, v));
#else
// FOr/FAnd are slow on fake SIMD.
V2F64 result = { MOZ_COPYSIGN(v.e[0], s.e[0]), MOZ_COPYSIGN(v.e[1], s.e[1]) };
return result;
#endif
}
#ifndef MOZ_FAKE_SIMD
inline unsigned FSignbit(V2F64 v) { return ((V2I64)v)[0] < 0; }
#else
inline unsigned FSignbit(V2F64 v) { return v.i[0] < 0; }
#endif
} // namespace SIMD
} // namespace mozilla
// Constants generated using Mathematica's GeneralMiniMaxApproximation. The
// Even elements are for the sin polynomial approximation, the odd for the
// cos polynomial approximation.
//
// The cos set uses one less coefficient than usual implementations to
// increase performance, raising the maximum approximation error to 2 bits.
extern const double coefficients[] __attribute__((aligned(16)));
const double coefficients[] = {
1.59046813973877163292e-10, 2.06467337476762997948e-9,
-2.50509001624159785668e-08, -2.75555495413759160741e-7,
2.75573146431678644161e-06, 2.48015808595638122085e-5,
-1.98412698327005105692e-04, -1.38888888779622760722e-3,
8.33333333332626768897e-03, 4.16666666665987187046e-2,
-1.66666666666666490881e-01, -4.99999999999999888978e-1
};
#define MOZ_UNLIKELY(x) __builtin_expect((x), 0)
struct sincos_result { double s, c; };
static sincos_result fast_sincos(double x)
{
using namespace mozilla::SIMD;
// Make argument non-negative but save the sign.
V2F64 vx = ScalarToVector2(x);
double absx = VectorToScalar(FAbs(vx));
// The optimized algorithm below doesn't currently support values of x beyond
// pow(2, 32) - 2. If x is beyond the range we support, fall back to the libm
// implementation. This check also handles the Infinity and NaN input cases.
// abs(x) < (221069929647945 / pow(2,16))
if (MOZ_UNLIKELY(!(absx < 3.37325942455970764160e9))) {
double sx = VectorToScalar(vx);
sincos_result result = {
sin(sx),
cos(sx)
};
return result;
}
static const double m_4_pi = 1.27323954473516276487; // 4.0 / M_PI
uint32_t i = static_cast<uint32_t>(absx * m_4_pi) + 1;
// Integer and fractional part modulo one octant.
double y = static_cast<double>(i & -2);
// Extended precision modular arithmetic
double e0 = y * -7.85398006439208984375e-1; // 1647099 / pow(2,21)
double e1 = y * -1.56958208208379801363e-7; // 1380619 / pow(2,43)
double e2 = y * -3.11168608594830669189e-14; // 4930663418217751 / pow(2,97)
double z = absx + e0 + e1 + e2;
// Compute quadrant 0 sin and cos polynomial approximations. We compute sin
// in vector element 0 and cos in vector element 1.
double zz = z * z;
const V2F64 *c = (const V2F64 *)coefficients;
V2F64 v2zz = Splat2(zz);
V2F64 ans = c[0];
ans = FMul(ans, v2zz);
ans = FAdd(ans, c[1]);
ans = FMul(ans, v2zz);
ans = FAdd(ans, c[2]);
ans = FMul(ans, v2zz);
ans = FAdd(ans, c[3]);
ans = FMul(ans, v2zz);
ans = FAdd(ans, c[4]);
ans = FMul(ans, v2zz);
ans = FAdd(ans, c[5]);
ans = FMul(ans, BuildVector(zz * z, zz));
ans = FAdd(ans, BuildVector(z, 1.0));
// Create an array which consists of the q0 sin, cos, -sin, -cos.
const V2F64 reflect_vecs[2] = { ans, FNeg(ans) };
const double *reflect = (const double*)reflect_vecs;
// Load the result with the desired reflection.
uint32_t quad_index = i >> 1;
uint32_t orig_sign = FSignbit(vx);
double sin_result = reflect[(quad_index + (orig_sign << 1)) & 3];
double cos_result = reflect[(quad_index + 1) & 3];
sincos_result result = { sin_result, cos_result };
return result;
}
int main() {
double sum = 0;
double v = 0.3;
for (int i = 0; i < 100000000; ++i) {
v += 0.2;
v /= 2;
if (0) {
double s, c;
sincos(0.2, &s, &c);
sum += s;
sum += c;
} else if (1) {
sincos_result r = fast_sincos(0.2);
sum += r.s;
sum += r.c;
} else {
sum += sin(v);
sum += cos(v);
}
}
cout << sum << endl;
// cout << r.s << ' ' << r.c << endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment