Created
April 18, 2015 16:45
-
-
Save syoyo/d80cb6f9936aa5f290da to your computer and use it in GitHub Desktop.
exp(x) using trigonometric function in HPC-ACE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
// Probably somewhat faster. | |
inline __m128d fastexp(__m128d v) | |
{ | |
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599); | |
const __m128d log2val = _mm_set1_pd(6.93145751953125E-1); | |
const __m128d zero = _mm_setzero_pd(); | |
const __m128d one = _fjsp_setone_v2r8(); | |
const __m128d c1 = _mm_set1_pd(1.42860682030941723212E-6); | |
__m128d x = v; | |
// a = x / log2 | |
__m128d a = _mm_mul_pd(x, inv_log2); | |
// a -= (a < 0) | |
// k = (int)floor(a); p = (float)k | |
__m128d p = _mm_cmplt_pd(a, zero); | |
p = _mm_and_pd(p, one); | |
a = _mm_sub_pd(a, p); | |
__m128d k = _fjsp_dtox_v2r8(a); | |
p = _fjsp_xtod_v2r8(k); | |
__m128d p_1023 = _mm_add_pd(p, _mm_set1_pd(1023.0)); | |
// x -= p * log2 | |
x = _mm_sub_pd(x, _mm_mul_pd(p, log2val)); | |
x = _mm_sub_pd(x, _mm_mul_pd(p, c1)); // BTW what is this? | |
// | |
// exp(x) ~= 1 + (1/2!)x^2 + (1/3!)x^3 + ... + (1/n!)x^n | |
// | |
__m128d x2 = _mm_mul_pd(x, x); // x^2, | |
__m128d x4c = _fjsp_trismul_v2r8(x2, one); // x^4, select cos table | |
__m128d m4c = _fjsp_trissel_v2r8(x2, one); // select cos table(sign bit 1) | |
__m128d x4s = _fjsp_trismul_v2r8(x2, zero); // x^4, select sin table | |
__m128d m4s = _fjsp_trissel_v2r8(x2, zero); // select sintable(sign bit 0) | |
// a4c = 1.0 + x^4((1/4!) + x^4((1/8!) + x^4((1/12!) + 0 * 0))) | |
__m128d a0c = zero; | |
__m128d a1c = _fjsp_trimadd_v2r8(a0c, x4c, 6); | |
__m128d a2c = _fjsp_trimadd_v2r8(a1c, x4c, 4); | |
__m128d a3c = _fjsp_trimadd_v2r8(a2c, x4c, 2); | |
__m128d a4c = _fjsp_trimadd_v2r8(a3c, x4c, 0); | |
// a4s = x * (1.0 + x^4((1/5!) + x^4((1/9!) + x^4((1/13!) + 0 * 0))) | |
__m128d a0s = zero; | |
__m128d a1s = _fjsp_trimadd_v2r8(a0s, x4s, 6); | |
__m128d a2s = _fjsp_trimadd_v2r8(a1s, x4s, 4); | |
__m128d a3s = _fjsp_trimadd_v2r8(a2s, x4s, 2); | |
__m128d a4s = _fjsp_trimadd_v2r8(a3s, x4s, 0); | |
// b4c = -((1/2!) + x^4((1/6!) + x^4((1/10!) + x^4((1/14!) + 0 * 0)))) | |
__m128d b0c = zero; | |
__m128d b1c = _fjsp_trimadd_v2r8(b0c, x4c, 7); | |
__m128d b2c = _fjsp_trimadd_v2r8(b1c, x4c, 5); | |
__m128d b3c = _fjsp_trimadd_v2r8(b2c, x4c, 3); | |
__m128d b4c = _fjsp_trimadd_v2r8(b3c, x4c, 1); | |
// b4s = -((1/3!) + x^4((1/7!) + x^4((1/11!) + x^4((1/15!) + 0 * 0)))) | |
__m128d b0s = zero; | |
__m128d b1s = _fjsp_trimadd_v2r8(b0s, x4s, 7); | |
__m128d b2s = _fjsp_trimadd_v2r8(b1s, x4s, 5); | |
__m128d b3s = _fjsp_trimadd_v2r8(b2s, x4s, 3); | |
__m128d b4s = _fjsp_trimadd_v2r8(b3s, x4s, 1); | |
// rc = a4c - (x^2 * b4c) | |
__m128d rc = _fjsp_nmsub_v2r8(x2, b4c, a4c); | |
// rs = x * (a4s - (x^2 * b4s)) | |
__m128d rs = _mm_mul_pd(x, _fjsp_nmsub_v2r8(x2, b4s, a4s)); | |
// r = rc + r; | |
__m128d r = _mm_add_pd(rc, rs); | |
// 2**52 = 4503599627370496ULL | |
const unsigned long long c0[2] = {4503599627370496ULL, 4503599627370496ULL}; | |
const unsigned long long mask[2] = {0x7FF0000000000000ULL, 0x7FF0000000000000ULL}; | |
__m128d t = _mm_load_pd((double*)c0); | |
__m128d m = _mm_load_pd((double*)mask); | |
// rr = 2^k | |
// ((p + 1023) << 52) & 0x7FF0000000000000 | |
__m128d k1d = _fjsp_dtox_v2r8(p_1023); | |
__m128d rr = _fjsp_pmaddx_v2r8(k1d, t, zero); | |
rr = _mm_and_pd(rr, m); | |
// exp(x) = r * 2^k | |
__m128d ret = _mm_mul_pd(r, rr); | |
return ret; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment