Created
May 8, 2015 06:59
-
-
Save syoyo/1516aa3e8e5489871fdd to your computer and use it in GitHub Desktop.
doubl2 exp(double2) performance on SPARC/HPC-ACE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cmath> | |
#include <emmintrin.h> | |
#define _mm_set1_pd(x) _mm_set_pd((x), (x)) | |
#define FORCE_INLINE __attriabute__((force_inline)) | |
// Compute exp(x) using trigonometric function. | |
inline __m128d exp_tri(__m128d v) | |
{ | |
const __m128d inv_log2 = _mm_set1_pd(1.4426950408889634073599); | |
const __m128d log2val = _mm_set1_pd(6.93145751953125E-1); | |
const __m128d zero = _mm_setzero_pd(); | |
const __m128d one = _mm_set1_pd(1.0); | |
const __m128d c1 = _mm_set1_pd(1.42860682030941723212E-6); | |
__m128d x = v; | |
// a = x / log2 | |
__m128d a = _mm_mul_pd(x, inv_log2); | |
// a -= (a < 0) | |
// k = (int)floor(a); p = (float)k | |
__m128d p = _mm_cmplt_pd(a, zero); | |
p = _mm_and_pd(p, one); | |
a = _mm_sub_pd(a, p); | |
__m128d k = _fjsp_dtox_v2r8(a); | |
p = _fjsp_xtod_v2r8(k); | |
__m128d p_1023 = _mm_add_pd(p, _mm_set1_pd(1023.0)); | |
// x -= p * log2 | |
x = _mm_sub_pd(x, _mm_mul_pd(p, log2val)); | |
x = _mm_sub_pd(x, _mm_mul_pd(p, c1)); | |
// abs(x) | |
__m128d negMask = _mm_cmplt_pd(x, zero); | |
x = _fjsp_abs_v2r8(x); | |
// | |
// exp(x) ~= 1 + (1/2!)x^2 + (1/3!)x^3 + ... + (1/n!)x^n | |
// | |
__m128d x2 = _mm_mul_pd(x, x); // x^2, | |
__m128d x4c = _fjsp_trismul_v2r8(x2, one); // x^4, select cos table | |
__m128d m4c = _fjsp_trissel_v2r8(x2, one); // select cos table(sign bit 1) | |
__m128d x4s = _fjsp_trismul_v2r8(x2, zero); // x^4, select sin table | |
__m128d m4s = _fjsp_trissel_v2r8(x2, zero); // select sintable(sign bit 0) | |
// a4c = 1.0 + x^4((1/4!) + x^4((1/8!) + x^4((1/12!) + 0 * 0))) | |
__m128d a0c = zero; | |
__m128d a1c = _fjsp_trimadd_v2r8(a0c, x4c, 6); | |
__m128d a2c = _fjsp_trimadd_v2r8(a1c, x4c, 4); | |
__m128d a3c = _fjsp_trimadd_v2r8(a2c, x4c, 2); | |
__m128d a4c = _fjsp_trimadd_v2r8(a3c, x4c, 0); | |
// a4s = x * (1.0 + x^4((1/5!) + x^4((1/9!) + x^4((1/13!) + 0 * 0))) | |
__m128d a0s = zero; | |
__m128d a1s = _fjsp_trimadd_v2r8(a0s, x4s, 6); | |
__m128d a2s = _fjsp_trimadd_v2r8(a1s, x4s, 4); | |
__m128d a3s = _fjsp_trimadd_v2r8(a2s, x4s, 2); | |
__m128d a4s = _fjsp_trimadd_v2r8(a3s, x4s, 0); | |
// b4c = -((1/2!) + x^4((1/6!) + x^4((1/10!) + x^4((1/14!) + 0 * 0)))) | |
__m128d b0c = zero; | |
__m128d b1c = _fjsp_trimadd_v2r8(b0c, x4c, 7); | |
__m128d b2c = _fjsp_trimadd_v2r8(b1c, x4c, 5); | |
__m128d b3c = _fjsp_trimadd_v2r8(b2c, x4c, 3); | |
__m128d b4c = _fjsp_trimadd_v2r8(b3c, x4c, 1); | |
// b4s = -((1/3!) + x^4((1/7!) + x^4((1/11!) + x^4((1/15!) + 0 * 0)))) | |
__m128d b0s = zero; | |
__m128d b1s = _fjsp_trimadd_v2r8(b0s, x4s, 7); | |
__m128d b2s = _fjsp_trimadd_v2r8(b1s, x4s, 5); | |
__m128d b3s = _fjsp_trimadd_v2r8(b2s, x4s, 3); | |
__m128d b4s = _fjsp_trimadd_v2r8(b3s, x4s, 1); | |
// rc = a4c - (x^2 * b4c) | |
__m128d rc = _fjsp_nmsub_v2r8(x2, b4c, a4c); | |
// rs = x * (a4s - (x^2 * b4s)) | |
__m128d rs = _mm_mul_pd(x, _fjsp_nmsub_v2r8(x2, b4s, a4s)); | |
// r = rc + r; | |
__m128d r = _mm_add_pd(rc, rs); | |
// r = (x < 0.0) ? -r : r | |
r = _fjsp_selmov_v2r8(_fjsp_neg_v2r8(r), r, negMask); | |
// 2**52 = 4503599627370496ULL | |
const unsigned long long c0[2] = {4503599627370496ULL, 4503599627370496ULL}; | |
const unsigned long long mask[2] = {0x7FF0000000000000ULL, 0x7FF0000000000000ULL}; | |
__m128d t = _mm_load_pd((double*)c0); | |
__m128d m = _mm_load_pd((double*)mask); | |
// rr = 2^k | |
// ((p + 1023) << 52) & 0x7FF0000000000000 | |
__m128d k1d = _fjsp_dtox_v2r8(p_1023); | |
__m128d rr = _fjsp_pmaddx_v2r8(k1d, t, zero); | |
rr = _mm_and_pd(rr, m); | |
// exp(x) = r * 2^k | |
__m128d ret = _mm_mul_pd(r, rr); | |
return ret; | |
} | |
// ------------------------------------------------------ | |
void bench_2SIMD(const char *msg, __m128d (*f)(__m128d), const std::vector<double>& v) | |
{ | |
const size_t n = v.size(); | |
cybozu::CpuClock clk; | |
double sum = 0; | |
clk.begin(); | |
for (size_t i = 0; i < n/2; i++) { | |
__m128d f2 = f(_mm_load_pd(&v[2*i])); | |
double f[2]; | |
_mm_store_pd(f, f2); | |
sum += f[0] + f[1]; | |
} | |
clk.end(); | |
double peak = 0; | |
double rms = 0; | |
for (size_t i = 0; i < n/2; i++) { | |
double ex[2]; | |
__m128d ex2 = f(_mm_load_pd(&v[2*i])); | |
_mm_store_pd(ex, ex2); | |
double exf0 = exp(v[2*i+0]); | |
double exf1 = exp(v[2*i+1]); | |
double err0 = fabs(exf0 - ex[0]) / ex[0]; | |
if (peak < err0) { | |
peak = err0; | |
} | |
rms += err0 * err0; | |
double err1 = fabs(exf1 - ex[1]) / ex[1]; | |
if (peak < err1) { | |
peak = err1; | |
} | |
rms += err1 * err1; | |
} | |
rms /= n; | |
rms = sqrt(rms); | |
printf("%s %7.2f %.18e %e %e\n", msg, clk.getClock() / double(n), sum, peak, rms); | |
} | |
int main() | |
... | |
const struct { | |
const char *name; | |
__m128d (*f)(__m128d); | |
} tbl_2SIMD[] = { | |
{"exp_tri(double2) ", exp_tri } | |
}; | |
for (size_t i = 0; i < sizeof(tbl_2SIMD) / sizeof(*tbl_2SIMD); i++) { | |
bench_2SIMD(tbl_2SIMD[i].name, tbl_2SIMD[i].f, v); | |
} | |
... | |
} | |
// --------------------------------------- | |
$ ./a.out | |
exp 119.15 1.635514436087156064e+05 0.000000e+00 0.000000e+00 | |
fmath:exp(double) 22.23 1.635514436087156064e+05 7.572649e-16 1.774574e-16 | |
exp_tri(double2) 27.13 1.581559103355605621e+05 9.168868e-02 4.481628e-02 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment