Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created April 18, 2015 16:45
Show Gist options
  • Save syoyo/d80cb6f9936aa5f290da to your computer and use it in GitHub Desktop.
Save syoyo/d80cb6f9936aa5f290da to your computer and use it in GitHub Desktop.
exp(x) using trigonometric function in HPC-ACE
#include <cstdio>
#include <cmath>
#include <emmintrin.h>
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
// Probably somewhat faster.
inline __m128d fastexp(__m128d v)
{
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599);
const __m128d log2val = _mm_set1_pd(6.93145751953125E-1);
const __m128d zero = _mm_setzero_pd();
const __m128d one = _fjsp_setone_v2r8();
const __m128d c1 = _mm_set1_pd(1.42860682030941723212E-6);
__m128d x = v;
// a = x / log2
__m128d a = _mm_mul_pd(x, inv_log2);
// a -= (a < 0)
// k = (int)floor(a); p = (float)k
__m128d p = _mm_cmplt_pd(a, zero);
p = _mm_and_pd(p, one);
a = _mm_sub_pd(a, p);
__m128d k = _fjsp_dtox_v2r8(a);
p = _fjsp_xtod_v2r8(k);
__m128d p_1023 = _mm_add_pd(p, _mm_set1_pd(1023.0));
// x -= p * log2
x = _mm_sub_pd(x, _mm_mul_pd(p, log2val));
x = _mm_sub_pd(x, _mm_mul_pd(p, c1)); // BTW what is this?
//
// exp(x) ~= 1 + (1/2!)x^2 + (1/3!)x^3 + ... + (1/n!)x^n
//
__m128d x2 = _mm_mul_pd(x, x); // x^2,
__m128d x4c = _fjsp_trismul_v2r8(x2, one); // x^4, select cos table
__m128d m4c = _fjsp_trissel_v2r8(x2, one); // select cos table(sign bit 1)
__m128d x4s = _fjsp_trismul_v2r8(x2, zero); // x^4, select sin table
__m128d m4s = _fjsp_trissel_v2r8(x2, zero); // select sintable(sign bit 0)
// a4c = 1.0 + x^4((1/4!) + x^4((1/8!) + x^4((1/12!) + 0 * 0)))
__m128d a0c = zero;
__m128d a1c = _fjsp_trimadd_v2r8(a0c, x4c, 6);
__m128d a2c = _fjsp_trimadd_v2r8(a1c, x4c, 4);
__m128d a3c = _fjsp_trimadd_v2r8(a2c, x4c, 2);
__m128d a4c = _fjsp_trimadd_v2r8(a3c, x4c, 0);
// a4s = x * (1.0 + x^4((1/5!) + x^4((1/9!) + x^4((1/13!) + 0 * 0)))
__m128d a0s = zero;
__m128d a1s = _fjsp_trimadd_v2r8(a0s, x4s, 6);
__m128d a2s = _fjsp_trimadd_v2r8(a1s, x4s, 4);
__m128d a3s = _fjsp_trimadd_v2r8(a2s, x4s, 2);
__m128d a4s = _fjsp_trimadd_v2r8(a3s, x4s, 0);
// b4c = -((1/2!) + x^4((1/6!) + x^4((1/10!) + x^4((1/14!) + 0 * 0))))
__m128d b0c = zero;
__m128d b1c = _fjsp_trimadd_v2r8(b0c, x4c, 7);
__m128d b2c = _fjsp_trimadd_v2r8(b1c, x4c, 5);
__m128d b3c = _fjsp_trimadd_v2r8(b2c, x4c, 3);
__m128d b4c = _fjsp_trimadd_v2r8(b3c, x4c, 1);
// b4s = -((1/3!) + x^4((1/7!) + x^4((1/11!) + x^4((1/15!) + 0 * 0))))
__m128d b0s = zero;
__m128d b1s = _fjsp_trimadd_v2r8(b0s, x4s, 7);
__m128d b2s = _fjsp_trimadd_v2r8(b1s, x4s, 5);
__m128d b3s = _fjsp_trimadd_v2r8(b2s, x4s, 3);
__m128d b4s = _fjsp_trimadd_v2r8(b3s, x4s, 1);
// rc = a4c - (x^2 * b4c)
__m128d rc = _fjsp_nmsub_v2r8(x2, b4c, a4c);
// rs = x * (a4s - (x^2 * b4s))
__m128d rs = _mm_mul_pd(x, _fjsp_nmsub_v2r8(x2, b4s, a4s));
// r = rc + r;
__m128d r = _mm_add_pd(rc, rs);
// 2**52 = 4503599627370496ULL
const unsigned long long c0[2] = {4503599627370496ULL, 4503599627370496ULL};
const unsigned long long mask[2] = {0x7FF0000000000000ULL, 0x7FF0000000000000ULL};
__m128d t = _mm_load_pd((double*)c0);
__m128d m = _mm_load_pd((double*)mask);
// rr = 2^k
// ((p + 1023) << 52) & 0x7FF0000000000000
__m128d k1d = _fjsp_dtox_v2r8(p_1023);
__m128d rr = _fjsp_pmaddx_v2r8(k1d, t, zero);
rr = _mm_and_pd(rr, m);
// exp(x) = r * 2^k
__m128d ret = _mm_mul_pd(r, rr);
return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment