Created
June 30, 2015 14:47
-
-
Save syoyo/5fcf1e1400130d01023d to your computer and use it in GitHub Desktop.
new_exp8d
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// SPARC HPC-ACE
// FCCpx -Krestp=all -Kfast expf_test.cpp
//
fmath_exp aveDiff= 1e-06 maxDiff=3.891789e-06 | |
new_exp aveDiff= 2e-08 maxDiff=1.192089e-07 | |
new_exp4 aveDiff= 2e-08 maxDiff=1.192078e-07 | |
new_exp4d aveDiff= 2e-08 maxDiff=5.036628e-08 | |
std::exp 0.197360 | |
fmath_exp 0.110120 | |
new_exp 0.105100 | |
std::exp4 0.102860 | |
fmath_exp4 0.107620 | |
new_exp4 0.065870 | |
std_exp4d 0.066650 | |
new_exp4d 0.053880 | |
new_exp8d 0.061670 | |
dummy=1236149.375000 | |
void new_exp8d(double py[8], const double px[8]) | |
{ | |
const __m128d c256 = _mm_set_pd(1.0 / 256, 1.0 / 256); | |
const __m128d c0 = _mm_set_pd(FMATH_EXP_C0, FMATH_EXP_C0); | |
const __m128d c1 = _mm_set_pd(FMATH_EXP_C1, FMATH_EXP_C1); | |
const __m128d c2 = _mm_set_pd(FMATH_EXP_C2, FMATH_EXP_C2); | |
const __m128d c3 = _mm_set_pd(FMATH_EXP_C3, FMATH_EXP_C3); | |
const __m128d c4 = _mm_set_pd(FMATH_EXP_C4, FMATH_EXP_C4); | |
const __m128d c5 = _mm_set_pd(FMATH_EXP_C5, FMATH_EXP_C5); | |
__m128d t0 = _mm_load_pd(px + 0); | |
__m128d t1 = _mm_load_pd(px + 2); | |
__m128d t2 = _mm_load_pd(px + 4); | |
__m128d t3 = _mm_load_pd(px + 6); | |
t0 = _mm_mul_pd(t0, c256); | |
t1 = _mm_mul_pd(t1, c256); | |
t2 = _mm_mul_pd(t2, c256); | |
t3 = _mm_mul_pd(t3, c256); | |
__m128d y0, y1, y2, y3; | |
y0 = _fjsp_madd_v2r8(c5, t0, c4); | |
y1 = _fjsp_madd_v2r8(c5, t1, c4); | |
y2 = _fjsp_madd_v2r8(c5, t2, c4); | |
y3 = _fjsp_madd_v2r8(c5, t3, c4); | |
y0 = _fjsp_madd_v2r8(y0, t0, c3); | |
y1 = _fjsp_madd_v2r8(y1, t1, c3); | |
y2 = _fjsp_madd_v2r8(y2, t2, c3); | |
y3 = _fjsp_madd_v2r8(y3, t3, c3); | |
y0 = _fjsp_madd_v2r8(y0, t0, c2); | |
y1 = _fjsp_madd_v2r8(y1, t1, c2); | |
y2 = _fjsp_madd_v2r8(y2, t2, c2); | |
y3 = _fjsp_madd_v2r8(y3, t3, c2); | |
y0 = _fjsp_madd_v2r8(y0, t0, c1); | |
y1 = _fjsp_madd_v2r8(y1, t1, c1); | |
y2 = _fjsp_madd_v2r8(y2, t2, c1); | |
y3 = _fjsp_madd_v2r8(y3, t3, c1); | |
y0 = _fjsp_madd_v2r8(y0, t0, c0); | |
y1 = _fjsp_madd_v2r8(y1, t1, c0); | |
y2 = _fjsp_madd_v2r8(y2, t2, c0); | |
y3 = _fjsp_madd_v2r8(y3, t3, c0); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y0 = _mm_mul_pd(y0, y0); y1 = _mm_mul_pd(y1, y1); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
y2 = _mm_mul_pd(y2, y2); y3 = _mm_mul_pd(y3, y3); | |
_mm_store_pd(py + 0, y0); | |
_mm_store_pd(py + 2, y1); | |
_mm_store_pd(py + 4, y2); | |
_mm_store_pd(py + 6, y3); | |
} | |
.... | |
float bench2d8(const char *msg, void func(double*, const double *)) | |
{ | |
uint64_t clk = getClk(); | |
MIE_ALIGN(16) double xa[8] = { -1.2, 2.4, 3.5, 0.6, 10.0, -8.33, 4.56, 0.12 }; | |
MIE_ALIGN(16) double ya[8]; | |
const int N = 100000; | |
for (int i = 0; i < N; i++) { | |
func(ya, xa); | |
xa[0] += 1e-7; | |
xa[1] += 1e-7; | |
xa[2] += 1e-7; | |
xa[3] += 1e-7; | |
xa[4] += 1e-7; | |
xa[5] += 1e-7; | |
xa[6] += 1e-7; | |
xa[7] += 1e-7; | |
} | |
double t = ya[0] + ya[1] + ya[2] + ya[3] + ya[4] + ya[5] + ya[6] + ya[7]; | |
double time = (getClk() - clk) / double(N); | |
printf("%s %f\n", msg, time); | |
return t; | |
} | |
... | |
// == sim start == | |
LLFB7 | |
L66 | |
LSSN171 | |
L375 | |
LSSN172 | |
LSSN173 | |
00000001 : ldd %f32 | |
LSSN174 | |
LSSN175 | |
00000002 : ldd %f36 | |
LSSN176 | |
LSSN177 | |
00000003 : ldd %f38 | |
LSSN178 | |
LSSN179 | |
00000004 : ldd %f40 | |
LSSN180 | |
00000005 : ldd %f34 | |
00000006 : ldd %f290 | |
00000007 : ldd %f42 | |
00000008 : ldd %f298 | |
00000009 : ldd %f46 | |
00000010 : ldd %f302 | |
00000011 : ldd %f52 | |
00000012 : ldd %f308 | |
00000013 : ldd %f54 | |
LSSN181 | |
00000014 : ldd %f310 | |
00000015 : fmuld %f34, %f32, %f32 | |
00000016 : fmuld %f34, %f36, %f36 | |
LSSN182 | |
00000017 : ldd %f56 | |
LSSN183 | |
00000018 : ldd %f312 | |
00000019 : fmuld %f34, %f38, %f38 | |
00000020 : fmuld %f34, %f40, %f34 | |
LSSN184 | |
00000021 : ldd %f58 | |
LSSN185 | |
00000022 : ldd %f314 | |
00000023 : fmaddd %f46, %f32, %f42, %f44 | |
LSSN186 | |
00000050 : fmaddd %f46, %f34, %f58, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000053 : fmuld %f44, %f44, %f44 | |
00000054 : fmuld %f48, %f48, %f48 | |
00000055 : fmuld %f50, %f50, %f50 | |
00000056 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000059 : fmuld %f44, %f44, %f44 | |
00000060 : fmuld %f48, %f48, %f48 | |
00000061 : fmuld %f50, %f50, %f50 | |
00000062 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000065 : fmuld %f44, %f44, %f44 | |
00000066 : fmuld %f48, %f48, %f48 | |
00000067 : fmuld %f50, %f50, %f50 | |
00000068 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000071 : fmuld %f44, %f44, %f44 | |
00000072 : fmuld %f48, %f48, %f48 | |
00000073 : fmuld %f50, %f50, %f50 | |
00000074 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000077 : fmuld %f44, %f44, %f44 | |
00000078 : fmuld %f48, %f48, %f48 | |
00000079 : fmuld %f50, %f50, %f50 | |
00000080 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000083 : fmuld %f44, %f44, %f44 | |
00000084 : fmuld %f48, %f48, %f48 | |
00000085 : fmuld %f50, %f50, %f50 | |
00000086 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000089 : fmuld %f44, %f44, %f44 | |
00000090 : fmuld %f48, %f48, %f48 | |
00000091 : fmuld %f50, %f50, %f50 | |
00000092 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f48(3) | |
-- STALL -- : f48(2) | |
-- STALL -- : f48(1) | |
00000096 : fmuld %f48, %f48, %f48 | |
LSSN187 | |
LSSN188 | |
00000097 : fmuld %f44, %f44, %f44 | |
00000098 : fmuld %f50, %f50, %f50 | |
LSSN189 | |
00000099 : fmuld %f46, %f46, %f46 | |
-- STALL -- : f44(3) | |
-- STALL -- : f44(2) | |
-- STALL -- : f44(1) | |
00000103 : std %f44 | |
00000104 : std %f48 | |
00000105 : std %f50 | |
00000106 : std %f46 | |
// == sim end ==
// Total cycles : 106
// Stall cycles : 28
// Efficiency : 73.58%
// fp : 56 (71.79%)
// ld : 18 (23.08%)
// st : 4 (5.13%)
// call : 0 (0.00%)
// special : 0 (0.00%)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment