simonlindholm · January 3, 2019 14:42
diff --git a/sincos.cpp b/sincos.cpp
 #define _GNU_SOURCE
 #include <bits/stdc++.h>
 using namespace std;

 /* mfbt basic SIMD wrappers. */

 #include <math.h>
 #include <stdint.h>

 // Figure out how to get access to SIMD on the current compiler.
 #if defined(_MSC_VER)
 #  if _M_IX86_FP >= 2
     // MSVC supports <xmmintrin.h>.
 #    define MOZ_XMMINTRIN_SIMD 1
 #  endif
 #elif defined(__clang__)
   // Clang supports the SIMD syntax and features even on targets which lack
   // SIMD support.
 #  define MOZ_EXT_VECTOR_SIMD 1
 #  define MOZ_BUILTIN_SIMD 1
 #elif defined(__GNUC__)
   // GCC emits suboptimal code around xorpd though (filed as GCC bug 60826), so
   // we use <xmmintrin.h> there also for now.
 #  if defined(__SSE2__)
 #    define MOZ_XMMINTRIN_SIMD 1
 #  elif defined(__ARM_NEON__) || defined(__ALTIVEC__)
 #    define MOZ_BUILTIN_SIMD 1
 #  endif
 #endif
 #if !defined(MOZ_XMMINTRIN_SIMD) && !defined(MOZ_BUILTIN_SIMD)
   // No SIMD support. Do what we can.
 #  define MOZ_FAKE_SIMD 1
 #endif

 // Figure out how to call copysign.
 #ifdef __GNUC__
 #  define MOZ_COPYSIGN __builtin_copysign
 #elif defined _WIN32
 #  define MOZ_COPYSIGN _copysign
 #else
 #  define MOZ_COPYSIGN copysign
 #endif

 #ifdef MOZ_XMMINTRIN_SIMD
 #  include <xmmintrin.h>
 #endif

 namespace mozilla {

 namespace SIMD {

 #ifdef MOZ_BUILTIN_SIMD
   // Add may_alias since we often have to cast pointers around, especially
   // since we're restricted in what we can do due to portability constraints.
 #  ifdef MOZ_EXT_VECTOR_SIMD
 typedef double V2F64 __attribute__((__ext_vector_type__(2), __may_alias__));
 typedef int64_t V2I64 __attribute__((__ext_vector_type__(2), __may_alias__));
 #  else
 typedef double V2F64 __attribute__((__vector_size__(16), __may_alias__));
 typedef int64_t V2I64 __attribute__((__vector_size__(16), __may_alias__));
 #  endif
 #endif
 #ifdef MOZ_XMMINTRIN_SIMD
 typedef __m128d V2F64;
 typedef __m128i V2I64;
 #endif
 #ifdef MOZ_FAKE_SIMD
 typedef union { double e[2]; int64_t i[2]; } V2F64;
 typedef struct { int64_t e[2]; } V2I64;
 #endif

 #ifdef MOZ_BUILTIN_SIMD
 inline V2F64 FAdd(V2F64 l, V2F64 r) { return l + r; }
 inline V2F64 FSub(V2F64 l, V2F64 r) { return l - r; }
 inline V2F64 FMul(V2F64 l, V2F64 r) { return l * r; }
 inline V2F64 FDiv(V2F64 l, V2F64 r) { return l / r; }
 #endif
 #ifdef MOZ_XMMINTRIN_SIMD
 inline V2F64 FAdd(V2F64 l, V2F64 r) { return _mm_add_pd(l, r); }
 inline V2F64 FSub(V2F64 l, V2F64 r) { return _mm_sub_pd(l, r); }
 inline V2F64 FMul(V2F64 l, V2F64 r) { return _mm_mul_pd(l, r); }
 inline V2F64 FDiv(V2F64 l, V2F64 r) { return _mm_div_pd(l, r); }
 #endif
 #ifdef MOZ_FAKE_SIMD
 inline V2F64 FAdd(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] + r.e[0], l.e[1] + r.e[1] };
  return result;
 }
 inline V2F64 FSub(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] - r.e[0], l.e[1] - r.e[1] };
  return result;
 }
 inline V2F64 FMul(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] * r.e[0], l.e[1] * r.e[1] };
  return result;
 }
 inline V2F64 FDiv(V2F64 l, V2F64 r) {
  V2F64 result = { l.e[0] / r.e[0], l.e[1] / r.e[1] };
  return result;
 }
 #endif

 #ifdef MOZ_BUILTIN_SIMD
 // We do need to define the bitwise operators though. We don't use
 // operator& and friends because these are actually floating-point types, and
 // it's confusing to implicitly treat them like integer types in a C++ API.
 // And speaking of conflating types in a confusing way, the compiler provides
 // bitwise casts through the plain cast syntax, which is inconsistent, but
 // the way it is.
 #  if defined(__clang__ ) && defined(__SSE2__)
 // Clang has a philosophy that the optimizer knows better than the human
 // writing vector intrinsics and studying the assembly output. It isn't always
 // true. Use inline asm rinses, in this case to prevent it from converting
 // operations like andnpd into integer operations, defeating CSE with other
 // vectors, requiring awkward movabs instructions to materialize immediates,
 // and generally doing weird things.
 inline V2F64 FAnd   (V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)((V2I64)l & (V2I64)r);
  return result;
 }
 inline V2F64 FOr  (V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)((V2I64)l | (V2I64)r);
  return result;
 }
 inline V2F64 FXor   (V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)((V2I64)l ^ (V2I64)r);
  return result;
 }
 inline V2F64 FAndnot(V2F64 l, V2F64 r) {
  asm("" : "+x"(l));
  V2F64 result = (V2F64)(~(V2I64)l & (V2I64)r);
  return result;
 }
 #  else
 inline V2F64 FAnd   (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l & (V2I64)r); }
 inline V2F64 FOr    (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l | (V2I64)r); }
 inline V2F64 FXor   (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l ^ (V2I64)r); }
 inline V2F64 FAndnot(V2F64 l, V2F64 r) { return (V2F64)(~(V2I64)l & (V2I64)r);}
 #  endif
 #endif
 #ifdef MOZ_XMMINTRIN_SIMD
 inline V2F64 FAnd   (V2F64 l, V2F64 r) { return _mm_and_pd   (l, r); }
 inline V2F64 FOr    (V2F64 l, V2F64 r) { return _mm_or_pd    (l, r); }
 inline V2F64 FXor   (V2F64 l, V2F64 r) { return _mm_xor_pd   (l, r); }
 inline V2F64 FAndnot(V2F64 l, V2F64 r) { return _mm_andnot_pd(l, r); }
 #endif
 #ifdef MOZ_FAKE_SIMD
 inline V2F64 FAnd(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] & r.i[0];
  result.i[1] = l.i[1] & r.i[1];
  return result;
 }
 inline V2F64 FOr(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] | r.i[0];
  result.i[1] = l.i[1] | r.i[1];
  return result;
 }
 inline V2F64 FXor(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = l.i[0] ^ r.i[0];
  result.i[1] = l.i[1] ^ r.i[1];
  return result;
 }
 inline V2F64 FAndnot(V2F64 l, V2F64 r) {
  V2F64 result;
  result.i[0] = ~l.i[0] & r.i[0];
  result.i[1] = ~l.i[1] & r.i[1];
  return result;
 }
 #endif

 #ifdef MOZ_BUILTIN_SIMD
 #  if defined(__clang__ ) && defined(__SSE2__)
 // As above, clang thinks it is smart but it does weird things, this time
 // with a pslldq. Fix it.
 inline V2F64 BuildVector(double e0, double e1) {
  V2F64 result = (V2F64){ e0, e1 };
  asm("" : "+x"(result));
  return result;
 }
 #  else
 inline V2F64 BuildVector(double e0, double e1) { return (V2F64){ e0, e1 }; }
 #  endif
 inline V2F64 Splat2(double e) { return (V2F64){ e, e }; }
 inline V2F64 FZero() { return (V2F64){ 0.0, 0.0 }; }
 #endif
 #ifdef MOZ_XMMINTRIN_SIMD
 // This is not a typo. e1 and e0 really do go in this order.
 inline V2F64 BuildVector(double e0, double e1) { return _mm_set_pd(e1, e0); }
 inline V2F64 Splat2(double e) { return _mm_set1_pd(e); }
 inline V2F64 FZero() { return _mm_setzero_pd(); }
 #endif
 #ifdef MOZ_FAKE_SIMD
 inline V2F64 BuildVector(double e0, double e1) {
  V2F64 result = { e0, e1 };
  return result;
 }
 inline V2F64 Splat2(double e) {
  V2F64 result = { e, e };
  return result;
 }
 inline V2F64 FZero() {
  V2F64 result = { 0.0, 0.0 };
  return result;
 }
 #endif

 // Implicitly promote a scalar to a vector, leaving the high elements undefined.
 #if defined(MOZ_EXT_VECTOR_SIMD)
 // With ext_vector_type, we can use OpenCL-style swizzling.
 inline V2F64 ScalarToVector2(double e) {
  V2F64 result;
  result.x = e;
  return result;
 }
 #elif defined(__GNUC__) && defined(__SSE2__) && !defined(MOZ_FAKE_SIMD)
 // GCC appears to provide no nicer way to do this, but we can get exactly what
 // we want with an inline asm.
 inline V2F64 ScalarToVector2(double e) {
  V2F64 result;
  asm("" : "=x"(result) : "0"(e));
  return result;
 }
 #else
 // This suffices, but it's often suboptimal as some compilers seem to want to
 // set f to 0.
 inline V2F64 ScalarToVector2(double e) {
  double f;
  return BuildVector(e, f);
 }
 #endif

 // Demote a vector to a scalar with the value of its first element.
 #if defined(MOZ_FAKE_SIMD)
 inline double VectorToScalar(V2F64 v) { return v.e[0]; }
 inline int64_t VectorToScalar(V2I64 v) { return v.e[0]; }
 #elif defined(__clang__) || \
      (defined(__GNUC__) && \
       (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 inline double VectorToScalar(V2F64 v) { return v[0]; }
 inline int64_t VectorToScalar(V2I64 v) { return v[0]; }
 #else
 // G++ pre 4.8 doesn't support subscripting on vector types.
 // FIXME: Hopefully there's a better way to do this on MSVC.
 inline double VectorToScalar(V2F64 v) { return *(double*)&v; }
 inline int64_t VectorToScalar(V2I64 v) { return *(int64_t*)&v; }
 #endif

 inline V2F64 FNegzero() { return BuildVector(-0.0, -0.0); }
 inline V2F64 FSign(V2F64 v) { return FAnd(FNegzero(), v); }

 inline V2F64 FAbs(V2F64 v) {
 #ifndef MOZ_FAKE_SIMD
  return FAndnot(FNegzero(), v);
 #else
  // FAnd is slow on fake SIMD.
  V2F64 result = { fabs(v.e[0]), fabs(v.e[1]) };
  return result;
 #endif
 }

 inline V2F64 FNeg(V2F64 v) {
 #ifndef MOZ_FAKE_SIMD
  return FXor(FNegzero(), v);
 #else
  // FXor is slow on fake SIMD.
  V2F64 result = { -v.e[0], -v.e[1] };
  return result;
 #endif
 }

 inline V2F64 FCopysign(V2F64 v, V2F64 s) {
 #ifndef MOZ_FAKE_SIMD
  V2F64 negzero = FNegzero();
  return FOr(FAnd(negzero, s),
             FAndnot(negzero, v));
 #else
  // FOr/FAnd are slow on fake SIMD.
  V2F64 result = { MOZ_COPYSIGN(v.e[0], s.e[0]), MOZ_COPYSIGN(v.e[1], s.e[1]) };
  return result;
 #endif
 }

 #ifndef MOZ_FAKE_SIMD
 inline unsigned FSignbit(V2F64 v) { return ((V2I64)v)[0] < 0; }
 #else
 inline unsigned FSignbit(V2F64 v) { return v.i[0] < 0; }
 #endif

 } // namespace SIMD

 } // namespace mozilla


 // Constants generated using Mathematica's GeneralMiniMaxApproximation. The
 // Even elements are for the sin polynomial approximation, the odd for the
 // cos polynomial approximation.
 //
 // The cos set uses one less coefficient than usual implementations to
 // increase performance, raising the maximum approximation error to 2 bits.
 extern const double coefficients[] __attribute__((aligned(16)));
 const double coefficients[] = {
    1.59046813973877163292e-10, 2.06467337476762997948e-9,
    -2.50509001624159785668e-08, -2.75555495413759160741e-7,
    2.75573146431678644161e-06, 2.48015808595638122085e-5,
    -1.98412698327005105692e-04, -1.38888888779622760722e-3,
    8.33333333332626768897e-03, 4.16666666665987187046e-2,
    -1.66666666666666490881e-01, -4.99999999999999888978e-1
 };

 #define MOZ_UNLIKELY(x) __builtin_expect((x), 0)

 struct sincos_result { double s, c; };
 static sincos_result fast_sincos(double x)
 {
    using namespace mozilla::SIMD;

    // Make argument non-negative but save the sign.
    V2F64 vx = ScalarToVector2(x);
    double absx = VectorToScalar(FAbs(vx));

    // The optimized algorithm below doesn't currently support values of x beyond
    // pow(2, 32) - 2. If x is beyond the range we support, fall back to the libm
    // implementation. This check also handles the Infinity and NaN input cases.
    // abs(x) < (221069929647945 / pow(2,16))
    if (MOZ_UNLIKELY(!(absx < 3.37325942455970764160e9))) {
        double sx = VectorToScalar(vx);
        sincos_result result = {
            sin(sx),
            cos(sx)
        };
        return result;
    }

    static const double m_4_pi = 1.27323954473516276487; // 4.0 / M_PI
    uint32_t i = static_cast<uint32_t>(absx * m_4_pi) + 1;

    // Integer and fractional part modulo one octant.
    double y = static_cast<double>(i & -2);

    // Extended precision modular arithmetic
    double e0 = y * -7.85398006439208984375e-1;  // 1647099 / pow(2,21)
    double e1 = y * -1.56958208208379801363e-7;  // 1380619 / pow(2,43)
    double e2 = y * -3.11168608594830669189e-14; // 4930663418217751 / pow(2,97)
    double z = absx + e0 + e1 + e2;

    // Compute quadrant 0 sin and cos polynomial approximations. We compute sin
    // in vector element 0 and cos in vector element 1.
    double zz = z * z;
    const V2F64 *c = (const V2F64 *)coefficients;
    V2F64 v2zz = Splat2(zz);
    V2F64 ans = c[0];
    ans = FMul(ans, v2zz);
    ans = FAdd(ans, c[1]);
    ans = FMul(ans, v2zz);
    ans = FAdd(ans, c[2]);
    ans = FMul(ans, v2zz);
    ans = FAdd(ans, c[3]);
    ans = FMul(ans, v2zz);
    ans = FAdd(ans, c[4]);
    ans = FMul(ans, v2zz);
    ans = FAdd(ans, c[5]);
    ans = FMul(ans, BuildVector(zz * z, zz));
    ans = FAdd(ans, BuildVector(z, 1.0));

    // Create an array which consists of the q0 sin, cos, -sin, -cos.
    const V2F64 reflect_vecs[2] = { ans, FNeg(ans) };
    const double *reflect = (const double*)reflect_vecs;

    // Load the result with the desired reflection.
    uint32_t quad_index = i >> 1;
    uint32_t orig_sign = FSignbit(vx);
    double sin_result = reflect[(quad_index + (orig_sign << 1)) & 3];
    double cos_result = reflect[(quad_index + 1) & 3];

    sincos_result result = { sin_result, cos_result };
    return result;
 }

 int main() {
 	double sum = 0;
 	double v = 0.3;
 	for (int i = 0; i < 100000000; ++i) {
 		v += 0.2;
 		v /= 2;
 		if (0) {
 			double s, c;
 			sincos(0.2, &s, &c);
 			sum += s;
 			sum += c;
 		} else if (1) {
 			sincos_result r = fast_sincos(0.2);
 			sum += r.s;
 			sum += r.c;
 		} else {
 			sum += sin(v);
 			sum += cos(v);
 		}
 	}
 	cout << sum << endl;
 	// cout << r.s << ' ' << r.c << endl;
 }
	#define _GNU_SOURCE
	#include <bits/stdc++.h>
	using namespace std;

	/* mfbt basic SIMD wrappers. */

	#include <math.h>
	#include <stdint.h>

	// Figure out how to get access to SIMD on the current compiler.
	#if defined(_MSC_VER)
	# if _M_IX86_FP >= 2
	// MSVC supports <xmmintrin.h>.
	# define MOZ_XMMINTRIN_SIMD 1
	# endif
	#elif defined(__clang__)
	// Clang supports the SIMD syntax and features even on targets which lack
	// SIMD support.
	# define MOZ_EXT_VECTOR_SIMD 1
	# define MOZ_BUILTIN_SIMD 1
	#elif defined(__GNUC__)
	// GCC emits suboptimal code around xorpd though (filed as GCC bug 60826), so
	// we use <xmmintrin.h> there also for now.
	# if defined(__SSE2__)
	# define MOZ_XMMINTRIN_SIMD 1
	# elif defined(__ARM_NEON__) \|\| defined(__ALTIVEC__)
	# define MOZ_BUILTIN_SIMD 1
	# endif
	#endif
	#if !defined(MOZ_XMMINTRIN_SIMD) && !defined(MOZ_BUILTIN_SIMD)
	// No SIMD support. Do what we can.
	# define MOZ_FAKE_SIMD 1
	#endif

	// Figure out how to call copysign.
	#ifdef __GNUC__
	# define MOZ_COPYSIGN __builtin_copysign
	#elif defined _WIN32
	# define MOZ_COPYSIGN _copysign
	#else
	# define MOZ_COPYSIGN copysign
	#endif

	#ifdef MOZ_XMMINTRIN_SIMD
	# include <xmmintrin.h>
	#endif

	namespace mozilla {

	namespace SIMD {

	#ifdef MOZ_BUILTIN_SIMD
	// Add may_alias since we often have to cast pointers around, especially
	// since we're restricted in what we can do due to portability constraints.
	# ifdef MOZ_EXT_VECTOR_SIMD
	typedef double V2F64 __attribute__((__ext_vector_type__(2), __may_alias__));
	typedef int64_t V2I64 __attribute__((__ext_vector_type__(2), __may_alias__));
	# else
	typedef double V2F64 __attribute__((__vector_size__(16), __may_alias__));
	typedef int64_t V2I64 __attribute__((__vector_size__(16), __may_alias__));
	# endif
	#endif
	#ifdef MOZ_XMMINTRIN_SIMD
	typedef __m128d V2F64;
	typedef __m128i V2I64;
	#endif
	#ifdef MOZ_FAKE_SIMD
	typedef union { double e[2]; int64_t i[2]; } V2F64;
	typedef struct { int64_t e[2]; } V2I64;
	#endif

	#ifdef MOZ_BUILTIN_SIMD
	inline V2F64 FAdd(V2F64 l, V2F64 r) { return l + r; }
	inline V2F64 FSub(V2F64 l, V2F64 r) { return l - r; }
	inline V2F64 FMul(V2F64 l, V2F64 r) { return l * r; }
	inline V2F64 FDiv(V2F64 l, V2F64 r) { return l / r; }
	#endif
	#ifdef MOZ_XMMINTRIN_SIMD
	inline V2F64 FAdd(V2F64 l, V2F64 r) { return _mm_add_pd(l, r); }
	inline V2F64 FSub(V2F64 l, V2F64 r) { return _mm_sub_pd(l, r); }
	inline V2F64 FMul(V2F64 l, V2F64 r) { return _mm_mul_pd(l, r); }
	inline V2F64 FDiv(V2F64 l, V2F64 r) { return _mm_div_pd(l, r); }
	#endif
	#ifdef MOZ_FAKE_SIMD
	inline V2F64 FAdd(V2F64 l, V2F64 r) {
	V2F64 result = { l.e[0] + r.e[0], l.e[1] + r.e[1] };
	return result;
	}
	inline V2F64 FSub(V2F64 l, V2F64 r) {
	V2F64 result = { l.e[0] - r.e[0], l.e[1] - r.e[1] };
	return result;
	}
	inline V2F64 FMul(V2F64 l, V2F64 r) {
	V2F64 result = { l.e[0] * r.e[0], l.e[1] * r.e[1] };
	return result;
	}
	inline V2F64 FDiv(V2F64 l, V2F64 r) {
	V2F64 result = { l.e[0] / r.e[0], l.e[1] / r.e[1] };
	return result;
	}
	#endif

	#ifdef MOZ_BUILTIN_SIMD
	// We do need to define the bitwise operators though. We don't use
	// operator& and friends because these are actually floating-point types, and
	// it's confusing to implicitly treat them like integer types in a C++ API.
	// And speaking of conflating types in a confusing way, the compiler provides
	// bitwise casts through the plain cast syntax, which is inconsistent, but
	// the way it is.
	# if defined(__clang__ ) && defined(__SSE2__)
	// Clang has a philosophy that the optimizer knows better than the human
	// writing vector intrinsics and studying the assembly output. It isn't always
	// true. Use inline asm rinses, in this case to prevent it from converting
	// operations like andnpd into integer operations, defeating CSE with other
	// vectors, requiring awkward movabs instructions to materialize immediates,
	// and generally doing weird things.
	inline V2F64 FAnd (V2F64 l, V2F64 r) {
	asm("" : "+x"(l));
	V2F64 result = (V2F64)((V2I64)l & (V2I64)r);
	return result;
	}
	inline V2F64 FOr (V2F64 l, V2F64 r) {
	asm("" : "+x"(l));
	V2F64 result = (V2F64)((V2I64)l \| (V2I64)r);
	return result;
	}
	inline V2F64 FXor (V2F64 l, V2F64 r) {
	asm("" : "+x"(l));
	V2F64 result = (V2F64)((V2I64)l ^ (V2I64)r);
	return result;
	}
	inline V2F64 FAndnot(V2F64 l, V2F64 r) {
	asm("" : "+x"(l));
	V2F64 result = (V2F64)(~(V2I64)l & (V2I64)r);
	return result;
	}
	# else
	inline V2F64 FAnd (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l & (V2I64)r); }
	inline V2F64 FOr (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l \| (V2I64)r); }
	inline V2F64 FXor (V2F64 l, V2F64 r) { return (V2F64)((V2I64)l ^ (V2I64)r); }
	inline V2F64 FAndnot(V2F64 l, V2F64 r) { return (V2F64)(~(V2I64)l & (V2I64)r);}
	# endif
	#endif
	#ifdef MOZ_XMMINTRIN_SIMD
	inline V2F64 FAnd (V2F64 l, V2F64 r) { return _mm_and_pd (l, r); }
	inline V2F64 FOr (V2F64 l, V2F64 r) { return _mm_or_pd (l, r); }
	inline V2F64 FXor (V2F64 l, V2F64 r) { return _mm_xor_pd (l, r); }
	inline V2F64 FAndnot(V2F64 l, V2F64 r) { return _mm_andnot_pd(l, r); }
	#endif
	#ifdef MOZ_FAKE_SIMD
	inline V2F64 FAnd(V2F64 l, V2F64 r) {
	V2F64 result;
	result.i[0] = l.i[0] & r.i[0];
	result.i[1] = l.i[1] & r.i[1];
	return result;
	}
	inline V2F64 FOr(V2F64 l, V2F64 r) {
	V2F64 result;
	result.i[0] = l.i[0] \| r.i[0];
	result.i[1] = l.i[1] \| r.i[1];
	return result;
	}
	inline V2F64 FXor(V2F64 l, V2F64 r) {
	V2F64 result;
	result.i[0] = l.i[0] ^ r.i[0];
	result.i[1] = l.i[1] ^ r.i[1];
	return result;
	}
	inline V2F64 FAndnot(V2F64 l, V2F64 r) {
	V2F64 result;
	result.i[0] = ~l.i[0] & r.i[0];
	result.i[1] = ~l.i[1] & r.i[1];
	return result;
	}
	#endif

	#ifdef MOZ_BUILTIN_SIMD
	# if defined(__clang__ ) && defined(__SSE2__)
	// As above, clang thinks it is smart but it does weird things, this time
	// with a pslldq. Fix it.
	inline V2F64 BuildVector(double e0, double e1) {
	V2F64 result = (V2F64){ e0, e1 };
	asm("" : "+x"(result));
	return result;
	}
	# else
	inline V2F64 BuildVector(double e0, double e1) { return (V2F64){ e0, e1 }; }
	# endif
	inline V2F64 Splat2(double e) { return (V2F64){ e, e }; }
	inline V2F64 FZero() { return (V2F64){ 0.0, 0.0 }; }
	#endif
	#ifdef MOZ_XMMINTRIN_SIMD
	// This is not a typo. e1 and e0 really do go in this order.
	inline V2F64 BuildVector(double e0, double e1) { return _mm_set_pd(e1, e0); }
	inline V2F64 Splat2(double e) { return _mm_set1_pd(e); }
	inline V2F64 FZero() { return _mm_setzero_pd(); }
	#endif
	#ifdef MOZ_FAKE_SIMD
	inline V2F64 BuildVector(double e0, double e1) {
	V2F64 result = { e0, e1 };
	return result;
	}
	inline V2F64 Splat2(double e) {
	V2F64 result = { e, e };
	return result;
	}
	inline V2F64 FZero() {
	V2F64 result = { 0.0, 0.0 };
	return result;
	}
	#endif

	// Implicitly promote a scalar to a vector, leaving the high elements undefined.
	#if defined(MOZ_EXT_VECTOR_SIMD)
	// With ext_vector_type, we can use OpenCL-style swizzling.
	inline V2F64 ScalarToVector2(double e) {
	V2F64 result;
	result.x = e;
	return result;
	}
	#elif defined(__GNUC__) && defined(__SSE2__) && !defined(MOZ_FAKE_SIMD)
	// GCC appears to provide no nicer way to do this, but we can get exactly what
	// we want with an inline asm.
	inline V2F64 ScalarToVector2(double e) {
	V2F64 result;
	asm("" : "=x"(result) : "0"(e));
	return result;
	}
	#else
	// This suffices, but it's often suboptimal as some compilers seem to want to
	// set f to 0.
	inline V2F64 ScalarToVector2(double e) {
	double f;
	return BuildVector(e, f);
	}
	#endif

	// Demote a vector to a scalar with the value of its first element.
	#if defined(MOZ_FAKE_SIMD)
	inline double VectorToScalar(V2F64 v) { return v.e[0]; }
	inline int64_t VectorToScalar(V2I64 v) { return v.e[0]; }
	#elif defined(__clang__) \|\| \
	(defined(__GNUC__) && \
	(__GNUC__ > 4 \|\| (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
	inline double VectorToScalar(V2F64 v) { return v[0]; }
	inline int64_t VectorToScalar(V2I64 v) { return v[0]; }
	#else
	// G++ pre 4.8 doesn't support subscripting on vector types.
	// FIXME: Hopefully there's a better way to do this on MSVC.
	inline double VectorToScalar(V2F64 v) { return (double)&v; }
	inline int64_t VectorToScalar(V2I64 v) { return (int64_t)&v; }
	#endif

	inline V2F64 FNegzero() { return BuildVector(-0.0, -0.0); }
	inline V2F64 FSign(V2F64 v) { return FAnd(FNegzero(), v); }

	inline V2F64 FAbs(V2F64 v) {
	#ifndef MOZ_FAKE_SIMD
	return FAndnot(FNegzero(), v);
	#else
	// FAnd is slow on fake SIMD.
	V2F64 result = { fabs(v.e[0]), fabs(v.e[1]) };
	return result;
	#endif
	}

	inline V2F64 FNeg(V2F64 v) {
	#ifndef MOZ_FAKE_SIMD
	return FXor(FNegzero(), v);
	#else
	// FXor is slow on fake SIMD.
	V2F64 result = { -v.e[0], -v.e[1] };
	return result;
	#endif
	}

	inline V2F64 FCopysign(V2F64 v, V2F64 s) {
	#ifndef MOZ_FAKE_SIMD
	V2F64 negzero = FNegzero();
	return FOr(FAnd(negzero, s),
	FAndnot(negzero, v));
	#else
	// FOr/FAnd are slow on fake SIMD.
	V2F64 result = { MOZ_COPYSIGN(v.e[0], s.e[0]), MOZ_COPYSIGN(v.e[1], s.e[1]) };
	return result;
	#endif
	}

	#ifndef MOZ_FAKE_SIMD
	inline unsigned FSignbit(V2F64 v) { return ((V2I64)v)[0] < 0; }
	#else
	inline unsigned FSignbit(V2F64 v) { return v.i[0] < 0; }
	#endif

	} // namespace SIMD

	} // namespace mozilla


	// Constants generated using Mathematica's GeneralMiniMaxApproximation. The
	// Even elements are for the sin polynomial approximation, the odd for the
	// cos polynomial approximation.
	//
	// The cos set uses one less coefficient than usual implementations to
	// increase performance, raising the maximum approximation error to 2 bits.
	extern const double coefficients[] __attribute__((aligned(16)));
	const double coefficients[] = {
	1.59046813973877163292e-10, 2.06467337476762997948e-9,
	-2.50509001624159785668e-08, -2.75555495413759160741e-7,
	2.75573146431678644161e-06, 2.48015808595638122085e-5,
	-1.98412698327005105692e-04, -1.38888888779622760722e-3,
	8.33333333332626768897e-03, 4.16666666665987187046e-2,
	-1.66666666666666490881e-01, -4.99999999999999888978e-1
	};

	#define MOZ_UNLIKELY(x) __builtin_expect((x), 0)

	struct sincos_result { double s, c; };
	static sincos_result fast_sincos(double x)
	{
	using namespace mozilla::SIMD;

	// Make argument non-negative but save the sign.
	V2F64 vx = ScalarToVector2(x);
	double absx = VectorToScalar(FAbs(vx));

	// The optimized algorithm below doesn't currently support values of x beyond
	// pow(2, 32) - 2. If x is beyond the range we support, fall back to the libm
	// implementation. This check also handles the Infinity and NaN input cases.
	// abs(x) < (221069929647945 / pow(2,16))
	if (MOZ_UNLIKELY(!(absx < 3.37325942455970764160e9))) {
	double sx = VectorToScalar(vx);
	sincos_result result = {
	sin(sx),
	cos(sx)
	};
	return result;
	}

	static const double m_4_pi = 1.27323954473516276487; // 4.0 / M_PI
	uint32_t i = static_cast<uint32_t>(absx * m_4_pi) + 1;

	// Integer and fractional part modulo one octant.
	double y = static_cast<double>(i & -2);

	// Extended precision modular arithmetic
	double e0 = y * -7.85398006439208984375e-1; // 1647099 / pow(2,21)
	double e1 = y * -1.56958208208379801363e-7; // 1380619 / pow(2,43)
	double e2 = y * -3.11168608594830669189e-14; // 4930663418217751 / pow(2,97)
	double z = absx + e0 + e1 + e2;

	// Compute quadrant 0 sin and cos polynomial approximations. We compute sin
	// in vector element 0 and cos in vector element 1.
	double zz = z * z;
	const V2F64 c = (const V2F64 )coefficients;
	V2F64 v2zz = Splat2(zz);
	V2F64 ans = c[0];
	ans = FMul(ans, v2zz);
	ans = FAdd(ans, c[1]);
	ans = FMul(ans, v2zz);
	ans = FAdd(ans, c[2]);
	ans = FMul(ans, v2zz);
	ans = FAdd(ans, c[3]);
	ans = FMul(ans, v2zz);
	ans = FAdd(ans, c[4]);
	ans = FMul(ans, v2zz);
	ans = FAdd(ans, c[5]);
	ans = FMul(ans, BuildVector(zz * z, zz));
	ans = FAdd(ans, BuildVector(z, 1.0));

	// Create an array which consists of the q0 sin, cos, -sin, -cos.
	const V2F64 reflect_vecs[2] = { ans, FNeg(ans) };
	const double reflect = (const double)reflect_vecs;

	// Load the result with the desired reflection.
	uint32_t quad_index = i >> 1;
	uint32_t orig_sign = FSignbit(vx);
	double sin_result = reflect[(quad_index + (orig_sign << 1)) & 3];
	double cos_result = reflect[(quad_index + 1) & 3];

	sincos_result result = { sin_result, cos_result };
	return result;
	}

	int main() {
	double sum = 0;
	double v = 0.3;
	for (int i = 0; i < 100000000; ++i) {
	v += 0.2;
	v /= 2;
	if (0) {
	double s, c;
	sincos(0.2, &s, &c);
	sum += s;
	sum += c;
	} else if (1) {
	sincos_result r = fast_sincos(0.2);
	sum += r.s;
	sum += r.c;
	} else {
	sum += sin(v);
	sum += cos(v);
	}
	}
	cout << sum << endl;
	// cout << r.s << ' ' << r.c << endl;
	}