Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created June 30, 2015 14:47
Show Gist options
  • Save syoyo/5fcf1e1400130d01023d to your computer and use it in GitHub Desktop.
Save syoyo/5fcf1e1400130d01023d to your computer and use it in GitHub Desktop.
new_exp8d
//
// SPARC HPC-ACE
// FCCpx -Krestp=all -Kfast expf_test.cpp
//
fmath_exp aveDiff= 1e-06 maxDiff=3.891789e-06
new_exp aveDiff= 2e-08 maxDiff=1.192089e-07
new_exp4 aveDiff= 2e-08 maxDiff=1.192078e-07
new_exp4d aveDiff= 2e-08 maxDiff=5.036628e-08
std::exp 0.197360
fmath_exp 0.110120
new_exp 0.105100
std::exp4 0.102860
fmath_exp4 0.107620
new_exp4 0.065870
std_exp4d 0.066650
new_exp4d 0.053880
new_exp8d 0.061670
dummy=1236149.375000
void new_exp8d(double py[8], const double px[8])
{
const __m128d c256 = _mm_set_pd(1.0 / 256, 1.0 / 256);
const __m128d c0 = _mm_set_pd(FMATH_EXP_C0, FMATH_EXP_C0);
const __m128d c1 = _mm_set_pd(FMATH_EXP_C1, FMATH_EXP_C1);
const __m128d c2 = _mm_set_pd(FMATH_EXP_C2, FMATH_EXP_C2);
const __m128d c3 = _mm_set_pd(FMATH_EXP_C3, FMATH_EXP_C3);
const __m128d c4 = _mm_set_pd(FMATH_EXP_C4, FMATH_EXP_C4);
const __m128d c5 = _mm_set_pd(FMATH_EXP_C5, FMATH_EXP_C5);
__m128d t0 = _mm_load_pd(px + 0);
__m128d t1 = _mm_load_pd(px + 2);
__m128d t2 = _mm_load_pd(px + 4);
__m128d t3 = _mm_load_pd(px + 6);
t0 = _mm_mul_pd(t0, c256);
t1 = _mm_mul_pd(t1, c256);
t2 = _mm_mul_pd(t2, c256);
t3 = _mm_mul_pd(t3, c256);
__m128d y0, y1, y2, y3;
y0 = _fjsp_madd_v2r8(c5, t0, c4);
y1 = _fjsp_madd_v2r8(c5, t1, c4);
y2 = _fjsp_madd_v2r8(c5, t2, c4);
y3 = _fjsp_madd_v2r8(c5, t3, c4);
y0 = _fjsp_madd_v2r8(y0, t0, c3);
y1 = _fjsp_madd_v2r8(y1, t1, c3);
y2 = _fjsp_madd_v2r8(y2, t2, c3);
y3 = _fjsp_madd_v2r8(y3, t3, c3);
y0 = _fjsp_madd_v2r8(y0, t0, c2);
y1 = _fjsp_madd_v2r8(y1, t1, c2);
y2 = _fjsp_madd_v2r8(y2, t2, c2);
y3 = _fjsp_madd_v2r8(y3, t3, c2);
y0 = _fjsp_madd_v2r8(y0, t0, c1);
y1 = _fjsp_madd_v2r8(y1, t1, c1);
y2 = _fjsp_madd_v2r8(y2, t2, c1);
y3 = _fjsp_madd_v2r8(y3, t3, c1);
y0 = _fjsp_madd_v2r8(y0, t0, c0);
y1 = _fjsp_madd_v2r8(y1, t1, c0);
y2 = _fjsp_madd_v2r8(y2, t2, c0);
y3 = _fjsp_madd_v2r8(y3, t3, c0);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3);
_mm_store_pd(py + 0, y0);
_mm_store_pd(py + 2, y1);
_mm_store_pd(py + 4, y2);
_mm_store_pd(py + 6, y3);
}
....
float bench2d8(const char *msg, void func(double*, const double *))
{
uint64_t clk = getClk();
MIE_ALIGN(16) double xa[8] = { -1.2, 2.4, 3.5, 0.6, 10.0, -8.33, 4.56, 0.12 };
MIE_ALIGN(16) double ya[8];
const int N = 100000;
for (int i = 0; i < N; i++) {
func(ya, xa);
xa[0] += 1e-7;
xa[1] += 1e-7;
xa[2] += 1e-7;
xa[3] += 1e-7;
xa[4] += 1e-7;
xa[5] += 1e-7;
xa[6] += 1e-7;
xa[7] += 1e-7;
}
double t = ya[0] + ya[1] + ya[2] + ya[3] + ya[4] + ya[5] + ya[6] + ya[7];
double time = (getClk() - clk) / double(N);
printf("%s %f\n", msg, time);
return t;
}
...
// == sim start ==
LLFB7
L66
LSSN171
L375
LSSN172
LSSN173
00000001 : ldd %f32
LSSN174
LSSN175
00000002 : ldd %f36
LSSN176
LSSN177
00000003 : ldd %f38
LSSN178
LSSN179
00000004 : ldd %f40
LSSN180
00000005 : ldd %f34
00000006 : ldd %f290
00000007 : ldd %f42
00000008 : ldd %f298
00000009 : ldd %f46
00000010 : ldd %f302
00000011 : ldd %f52
00000012 : ldd %f308
00000013 : ldd %f54
LSSN181
00000014 : ldd %f310
00000015 : fmuld %f34, %f32, %f32
00000016 : fmuld %f34, %f36, %f36
LSSN182
00000017 : ldd %f56
LSSN183
00000018 : ldd %f312
00000019 : fmuld %f34, %f38, %f38
00000020 : fmuld %f34, %f40, %f34
LSSN184
00000021 : ldd %f58
LSSN185
00000022 : ldd %f314
00000023 : fmaddd %f46, %f32, %f42, %f44
LSSN186
00000050 : fmaddd %f46, %f34, %f58, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000053 : fmuld %f44, %f44, %f44
00000054 : fmuld %f48, %f48, %f48
00000055 : fmuld %f50, %f50, %f50
00000056 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000059 : fmuld %f44, %f44, %f44
00000060 : fmuld %f48, %f48, %f48
00000061 : fmuld %f50, %f50, %f50
00000062 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000065 : fmuld %f44, %f44, %f44
00000066 : fmuld %f48, %f48, %f48
00000067 : fmuld %f50, %f50, %f50
00000068 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000071 : fmuld %f44, %f44, %f44
00000072 : fmuld %f48, %f48, %f48
00000073 : fmuld %f50, %f50, %f50
00000074 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000077 : fmuld %f44, %f44, %f44
00000078 : fmuld %f48, %f48, %f48
00000079 : fmuld %f50, %f50, %f50
00000080 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000083 : fmuld %f44, %f44, %f44
00000084 : fmuld %f48, %f48, %f48
00000085 : fmuld %f50, %f50, %f50
00000086 : fmuld %f46, %f46, %f46
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000089 : fmuld %f44, %f44, %f44
00000090 : fmuld %f48, %f48, %f48
00000091 : fmuld %f50, %f50, %f50
00000092 : fmuld %f46, %f46, %f46
-- STALL -- : f48(3)
-- STALL -- : f48(2)
-- STALL -- : f48(1)
00000096 : fmuld %f48, %f48, %f48
LSSN187
LSSN188
00000097 : fmuld %f44, %f44, %f44
00000098 : fmuld %f50, %f50, %f50
LSSN189
00000099 : fmuld %f46, %f46, %f46
-- STALL -- : f44(3)
-- STALL -- : f44(2)
-- STALL -- : f44(1)
00000103 : std %f44
00000104 : std %f48
00000105 : std %f50
00000106 : std %f46
// == sim end ==
// Total cycles : 106
// Stall cycles : 28
// Efficiency : 73.58%
// fp : 56 (71.79%)
// ld : 18 (23.08%)
// st : 4 (5.13%)
// call : 0 (0.00%)
// special : 0 (0.00%)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment