Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created May 8, 2015 06:59
Show Gist options
  • Save syoyo/1516aa3e8e5489871fdd to your computer and use it in GitHub Desktop.
Save syoyo/1516aa3e8e5489871fdd to your computer and use it in GitHub Desktop.
doubl2 exp(double2) performance on SPARC/HPC-ACE
#include <cmath>
#include <emmintrin.h>
#define _mm_set1_pd(x) _mm_set_pd((x), (x))
#define FORCE_INLINE __attriabute__((force_inline))
// Compute exp(x) using trigonometric function.
inline __m128d exp_tri(__m128d v)
{
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599);
const __m128d log2val = _mm_set1_pd(6.93145751953125E-1);
const __m128d zero = _mm_setzero_pd();
const __m128d one = _mm_set1_pd(1.0);
const __m128d c1 = _mm_set1_pd(1.42860682030941723212E-6);
__m128d x = v;
// a = x / log2
__m128d a = _mm_mul_pd(x, inv_log2);
// a -= (a < 0)
// k = (int)floor(a); p = (float)k
__m128d p = _mm_cmplt_pd(a, zero);
p = _mm_and_pd(p, one);
a = _mm_sub_pd(a, p);
__m128d k = _fjsp_dtox_v2r8(a);
p = _fjsp_xtod_v2r8(k);
__m128d p_1023 = _mm_add_pd(p, _mm_set1_pd(1023.0));
// x -= p * log2
x = _mm_sub_pd(x, _mm_mul_pd(p, log2val));
x = _mm_sub_pd(x, _mm_mul_pd(p, c1));
// abs(x)
__m128d negMask = _mm_cmplt_pd(x, zero);
x = _fjsp_abs_v2r8(x);
//
// exp(x) ~= 1 + (1/2!)x^2 + (1/3!)x^3 + ... + (1/n!)x^n
//
__m128d x2 = _mm_mul_pd(x, x); // x^2,
__m128d x4c = _fjsp_trismul_v2r8(x2, one); // x^4, select cos table
__m128d m4c = _fjsp_trissel_v2r8(x2, one); // select cos table(sign bit 1)
__m128d x4s = _fjsp_trismul_v2r8(x2, zero); // x^4, select sin table
__m128d m4s = _fjsp_trissel_v2r8(x2, zero); // select sintable(sign bit 0)
// a4c = 1.0 + x^4((1/4!) + x^4((1/8!) + x^4((1/12!) + 0 * 0)))
__m128d a0c = zero;
__m128d a1c = _fjsp_trimadd_v2r8(a0c, x4c, 6);
__m128d a2c = _fjsp_trimadd_v2r8(a1c, x4c, 4);
__m128d a3c = _fjsp_trimadd_v2r8(a2c, x4c, 2);
__m128d a4c = _fjsp_trimadd_v2r8(a3c, x4c, 0);
// a4s = x * (1.0 + x^4((1/5!) + x^4((1/9!) + x^4((1/13!) + 0 * 0)))
__m128d a0s = zero;
__m128d a1s = _fjsp_trimadd_v2r8(a0s, x4s, 6);
__m128d a2s = _fjsp_trimadd_v2r8(a1s, x4s, 4);
__m128d a3s = _fjsp_trimadd_v2r8(a2s, x4s, 2);
__m128d a4s = _fjsp_trimadd_v2r8(a3s, x4s, 0);
// b4c = -((1/2!) + x^4((1/6!) + x^4((1/10!) + x^4((1/14!) + 0 * 0))))
__m128d b0c = zero;
__m128d b1c = _fjsp_trimadd_v2r8(b0c, x4c, 7);
__m128d b2c = _fjsp_trimadd_v2r8(b1c, x4c, 5);
__m128d b3c = _fjsp_trimadd_v2r8(b2c, x4c, 3);
__m128d b4c = _fjsp_trimadd_v2r8(b3c, x4c, 1);
// b4s = -((1/3!) + x^4((1/7!) + x^4((1/11!) + x^4((1/15!) + 0 * 0))))
__m128d b0s = zero;
__m128d b1s = _fjsp_trimadd_v2r8(b0s, x4s, 7);
__m128d b2s = _fjsp_trimadd_v2r8(b1s, x4s, 5);
__m128d b3s = _fjsp_trimadd_v2r8(b2s, x4s, 3);
__m128d b4s = _fjsp_trimadd_v2r8(b3s, x4s, 1);
// rc = a4c - (x^2 * b4c)
__m128d rc = _fjsp_nmsub_v2r8(x2, b4c, a4c);
// rs = x * (a4s - (x^2 * b4s))
__m128d rs = _mm_mul_pd(x, _fjsp_nmsub_v2r8(x2, b4s, a4s));
// r = rc + r;
__m128d r = _mm_add_pd(rc, rs);
// r = (x < 0.0) ? -r : r
r = _fjsp_selmov_v2r8(_fjsp_neg_v2r8(r), r, negMask);
// 2**52 = 4503599627370496ULL
const unsigned long long c0[2] = {4503599627370496ULL, 4503599627370496ULL};
const unsigned long long mask[2] = {0x7FF0000000000000ULL, 0x7FF0000000000000ULL};
__m128d t = _mm_load_pd((double*)c0);
__m128d m = _mm_load_pd((double*)mask);
// rr = 2^k
// ((p + 1023) << 52) & 0x7FF0000000000000
__m128d k1d = _fjsp_dtox_v2r8(p_1023);
__m128d rr = _fjsp_pmaddx_v2r8(k1d, t, zero);
rr = _mm_and_pd(rr, m);
// exp(x) = r * 2^k
__m128d ret = _mm_mul_pd(r, rr);
return ret;
}
// ------------------------------------------------------
void bench_2SIMD(const char *msg, __m128d (*f)(__m128d), const std::vector<double>& v)
{
const size_t n = v.size();
cybozu::CpuClock clk;
double sum = 0;
clk.begin();
for (size_t i = 0; i < n/2; i++) {
__m128d f2 = f(_mm_load_pd(&v[2*i]));
double f[2];
_mm_store_pd(f, f2);
sum += f[0] + f[1];
}
clk.end();
double peak = 0;
double rms = 0;
for (size_t i = 0; i < n/2; i++) {
double ex[2];
__m128d ex2 = f(_mm_load_pd(&v[2*i]));
_mm_store_pd(ex, ex2);
double exf0 = exp(v[2*i+0]);
double exf1 = exp(v[2*i+1]);
double err0 = fabs(exf0 - ex[0]) / ex[0];
if (peak < err0) {
peak = err0;
}
rms += err0 * err0;
double err1 = fabs(exf1 - ex[1]) / ex[1];
if (peak < err1) {
peak = err1;
}
rms += err1 * err1;
}
rms /= n;
rms = sqrt(rms);
printf("%s %7.2f %.18e %e %e\n", msg, clk.getClock() / double(n), sum, peak, rms);
}
int main()
...
const struct {
const char *name;
__m128d (*f)(__m128d);
} tbl_2SIMD[] = {
{"exp_tri(double2) ", exp_tri }
};
for (size_t i = 0; i < sizeof(tbl_2SIMD) / sizeof(*tbl_2SIMD); i++) {
bench_2SIMD(tbl_2SIMD[i].name, tbl_2SIMD[i].f, v);
}
...
}
// ---------------------------------------
$ ./a.out
exp 119.15 1.635514436087156064e+05 0.000000e+00 0.000000e+00
fmath:exp(double) 22.23 1.635514436087156064e+05 7.572649e-16 1.774574e-16
exp_tri(double2) 27.13 1.581559103355605621e+05 9.168868e-02 4.481628e-02
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment