Created
February 10, 2021 23:43
-
-
Save woachk/c854df442e6c69f3c67c3a5cbd843ab3 to your computer and use it in GitHub Desktop.
Benchmarking LDR/LDAR/LDAPR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>
#include <limits.h>
#include <assert.h>
#ifndef __aarch64__
#error This testcase is AArch64 specific and will not work on other processor families.
#endif
#ifdef __linux__
#include <sys/auxv.h>
#endif
uint64_t loadRcpc(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadRcpc \n" | |
".globl loadRcpc \n" | |
".align 4\n" | |
"loadRcpc:\n" | |
"_loadRcpc:\n" | |
"ldapr x4, [x3] \n" | |
"ldapr x5, [x2] \n" | |
"ldapr x6, [x1] \n" | |
"ldapr x7, [x0] \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
uint64_t loadRcpcDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadRcpcDmb \n" | |
".globl loadRcpcDmb \n" | |
".align 4\n" | |
"_loadRcpcDmb:\n" | |
"loadRcpcDmb:\n" | |
"dmb ishld \n" | |
"ldapr x4, [x3] \n" | |
"dmb ishld \n" | |
"ldapr x5, [x2] \n" | |
"dmb ishld \n" | |
"ldapr x6, [x1] \n" | |
"dmb ishld \n" | |
"ldapr x7, [x0] \n" | |
"dmb ishld \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
uint64_t loadAcquire(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadAcquire \n" | |
".globl loadAcquire \n" | |
".align 4\n" | |
"_loadAcquire:\n" | |
"loadAcquire:\n" | |
"ldar x4, [x3] \n" | |
"ldar x5, [x2] \n" | |
"ldar x6, [x1] \n" | |
"ldar x7, [x0] \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
uint64_t loadAcquireDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadAcquireDmb \n" | |
".globl loadAcquireDmb \n" | |
".align 4\n" | |
"_loadAcquireDmb:\n" | |
"loadAcquireDmb:\n" | |
"dmb ishld \n" | |
"ldar x4, [x3] \n" | |
"dmb ishld \n" | |
"ldar x5, [x2] \n" | |
"dmb ishld \n" | |
"ldar x6, [x1] \n" | |
"dmb ishld \n" | |
"ldar x7, [x0] \n" | |
"dmb ishld \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
uint64_t loadRegular(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadRegular \n" | |
".globl loadRegular \n" | |
".align 4\n" | |
"_loadRegular:\n" | |
"loadRegular:\n" | |
"ldr x4, [x3] \n" | |
"ldr x5, [x2] \n" | |
"ldr x6, [x1] \n" | |
"ldr x7, [x0] \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
uint64_t loadRegularDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d); | |
asm( | |
".globl _loadRegularDmb \n" | |
".globl loadRegularDmb \n" | |
".align 4\n" | |
"_loadRegularDmb:\n" | |
"loadRegularDmb:\n" | |
"dmb ishld \n" | |
"ldr x4, [x3] \n" | |
"dmb ishld \n" | |
"ldr x5, [x2] \n" | |
"dmb ishld \n" | |
"ldr x6, [x1] \n" | |
"dmb ishld \n" | |
"ldr x7, [x0] \n" | |
"dmb ishld \n" | |
"add x0, x4, x5 \n" | |
"add x0, x0, x6 \n" | |
"add x0, x0, x7 \n" | |
"ret"); | |
#ifndef __APPLE__
#define CLOCK_UPTIME_RAW 0
/*
 * Fallback for platforms other than Apple's: provides a function with the
 * name and call shape used by the benchmark, backed by
 * clock_gettime(CLOCK_MONOTONIC).
 *
 * a: clock selector; only CLOCK_UPTIME_RAW (0) is accepted.
 *
 * Returns the monotonic clock reading in nanoseconds, or (uint64_t)-1 if the
 * selector is not CLOCK_UPTIME_RAW or the underlying clock_gettime() fails.
 */
uint64_t clock_gettime_nsec_np(int a) {
    if (a != CLOCK_UPTIME_RAW)
        return (uint64_t)-1; // invalid clock type

    struct timespec ts;
    int res = clock_gettime(CLOCK_MONOTONIC, &ts);

    /*
     * Assert on the actual result: asserting on the bare message string
     * (a non-null pointer) would always pass and hide the failure.
     */
    assert(res == 0 && "Error in clock_gettime");
    if (res != 0)
        return (uint64_t)-1; // NDEBUG builds: never read the unset timespec

    /* Do the arithmetic explicitly in 64-bit unsigned. */
    return (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
}
#endif
int main (int argc, char **argv) { | |
uint64_t time_a; | |
uint64_t a = 2, b = 4, c = 6, d = 8; | |
uint64_t total = a + b + c + d; | |
printf("This testcase does test the performance of load instructions.\n\n"); | |
printf("A set of loops of 512m iterations which do:\n"); | |
printf("\n4x LDR + addition of the loaded value, which is returned and then checked.\n"); | |
printf("\n4x LDR, with each one surrounded by dmb ishld + addition of the loaded value\nwhich is returned and then checked.\n"); | |
printf("\nOn hardware with RCpc only: 4x LDAPR + addition of the loaded value, which is\nreturned and then checked.\n"); | |
printf("\nOn hardware with RCpc only: 4x LDAPR, with each one surrounded by dmb ishld +\naddition of the loaded value which is returned and then checked.\n"); | |
printf("\n4x LDAR + addition of the loaded value, which is returned and then checked.\n"); | |
printf("\n4x LDAR, with each one surrounded by dmb ishld + addition of the loaded value\nwhich is returned and then checked.\n"); | |
printf("\nNow testing.\n"); | |
fflush(stdout); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
for (uint64_t i = 0; i < INT_MAX / 4; i++) | |
if (loadRegular(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: No ) LDR : %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
for (uint64_t i = 0; i < INT_MAX / 4; i++) | |
if (loadRegularDmb(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: Yes) LDR : %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
#ifdef __linux__ | |
unsigned long hwcaps = getauxval(AT_HWCAP); | |
if (!(hwcaps & HWCAP_LRCPC)) | |
goto no_rcpc; | |
#endif | |
for (int i = 0; i < INT_MAX / 4; i++) | |
if (loadRcpc(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: No ) LDAPR: %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
for (int i = 0; i < INT_MAX / 4; i++) | |
if (loadRcpcDmb(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: Yes) LDAPR: %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
no_rcpc: | |
for (int i = 0; i < INT_MAX / 4; i++) | |
if (loadAcquire(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: No ) LDAR : %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
time_a = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |
for (int i = 0; i < INT_MAX / 4; i++) | |
if (loadAcquireDmb(&a, &b, &c, &d) != total) break; | |
printf("(DMB ISHLD between reads: Yes) LDAR : %llu ns\n", clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - time_a); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment