Created
May 29, 2017 22:22
-
-
Save rygorous/ccaca25f29b8bde85fba3d47b318d1c4 to your computer and use it in GitHub Desktop.
A53 latency tester
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Latency tester generator (Cortex-A53).
#define PROD_ALF "add %1,%1,%2\n" // ALU fast-forward: basic ALU ops; CCMP/CCMN | |
#define PROD_ALU "add %1,%1,%2,lsl #13\n" // ALU+shift; all bitfield move; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc. | |
#define PROD_SHF "lslv %1,%1,%2\n" // variable shifts, imm movs (e.g. "movz %1,#0,lsl #16") | |
#define PROD_LDR "ldr %1,[%4]\n" // load | |
#define CONS_ALU "add %1,%1,%2\n" // basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLZ/CLS; first (unshifted) src in ALU+shift | |
#define CONS_SHF "add %1,%2,%1,lsl #13\n" // second (shifted) src in ALU+shift; SBFM/UBFM/BFM/RBIT/REV*/var shifts/EXTR all sources | |
#define CONS_AGU "ldr %1,[%4,%1]\n" // load/store address generation unit | |
#define CONS_STR "str %1,[%4]\n" // store data | |
// Expands T(prod, <consumer>) once for every consumer kind (ALU, SHF, AGU,
// STR), pairing the given producer with each of them. T must be defined by
// the user of this list before it is expanded.
// Each backslash must be the last character on its line; the trailing
// "/* end */" lets every T(...) line end in a continuation.
#define LATENCY_TO_ALL(prod) \
    T(prod,ALU) \
    T(prod,SHF) \
    T(prod,AGU) \
    T(prod,STR) \
    /* end */
// Full test matrix: expands LATENCY_TO_ALL once per producer kind (ALF, ALU,
// SHF, LDR), which yields one T(prod,cons) per producer/consumer pair.
// Each backslash must be the last character on its line; the trailing
// "/* end */" lets every entry line end in a continuation.
#define ALL_LATENCY_TESTS \
    LATENCY_TO_ALL(ALF) \
    LATENCY_TO_ALL(ALU) \
    LATENCY_TO_ALL(SHF) \
    LATENCY_TO_ALL(LDR) \
    /* end */
// Map a short producer/consumer tag (e.g. ALF, AGU) to the matching template
// macro name by token pasting: PROD_PREFIX(LDR) -> PROD_LDR, which is then
// expanded to its instruction string. Nothing may follow the pasted name in
// the replacement list, or it would be emitted at every use.
#define PROD_PREFIX(x) PROD_##x
#define CONS_PREFIX(x) CONS_##x
#define T(prod,cons) \ | |
static void latency_##prod##_##cons(U32 niter) \ | |
{ \ | |
U64 buf[4] = { 0, 0, 0, 0 }; \ | |
U64 *e = buf; \ | |
U64 a=0,b=0,c=0,d=0,f=0; \ | |
__asm__ volatile(".align 3\n" \ | |
"1:\n" \ | |
"ldp %3,%6,[%4,#16]\n" /* must be slot0 - to clear pipe */ \ | |
"subs %w5,%w5,#1\n" /* dual-issues in second cycle of LDP */ \ | |
PROD_PREFIX(prod) \ | |
CONS_PREFIX(cons) \ | |
"ldp %3,%6,[%4,#16]\n" /* forces us to slot0 again */ \ | |
"b.ne 1b\n" /* dual-issues in second cycle of LDP */ \ | |
: "+&r"(a), "+&r"(b), "+&r"(c), "+&r"(d), "+&r"(e), "+&r"(niter), "+&r"(f) : "r"(1ull<<63)); \ | |
} | |
ALL_LATENCY_TESTS | |
#undef T | |
// prod=LDR cons=ALU (add) gives 7 cycles/iter; the assumed breakdown is:
//   c0  ldp (first)
//   c1  ldp (second) + subs
//   c2  ldr
//   c3  <stall>
//   c4  add
//   c5  ldp (first)
//   c6  ldp (second) + b.ne
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment