Skip to content

Instantly share code, notes, and snippets.

@woachk
Created February 10, 2021 23:43
Show Gist options
  • Save woachk/c854df442e6c69f3c67c3a5cbd843ab3 to your computer and use it in GitHub Desktop.
Save woachk/c854df442e6c69f3c67c3a5cbd843ab3 to your computer and use it in GitHub Desktop.
Benchmarking LDR/LDAR/LDAPR
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <limits.h>
#include <assert.h>
#ifndef __aarch64__
#error This testcase is AArch64 specific and will not work on other processor families.
#endif
#ifdef __linux__
#include <sys/auxv.h>
#endif
/*
 * loadRcpc - benchmark kernel: four LDAPR loads whose results are summed.
 *
 * Implemented as top-level assembly so the exact instruction sequence is
 * fixed and not subject to compiler scheduling.
 *
 * The code reads the four pointer arguments from x0-x3 (a, b, c, d in that
 * order under the AArch64 procedure call standard), loads one 64-bit value
 * through each of them in the order d, c, b, a, and returns the sum
 * *a + *b + *c + *d in x0. x4-x7 are used as scratch registers.
 *
 * LDAPR is only available on hardware with RCpc (see the banner printed by
 * main(), which also gates this kernel on HWCAP_LRCPC under Linux).
 */
uint64_t loadRcpc(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadRcpc \n"
".globl loadRcpc \n"
".align 4\n"
"loadRcpc:\n"
"_loadRcpc:\n"
"ldapr x4, [x3] \n" /* x4 = *d */
"ldapr x5, [x2] \n" /* x5 = *c */
"ldapr x6, [x1] \n" /* x6 = *b */
"ldapr x7, [x0] \n" /* x7 = *a */
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
/*
 * loadRcpcDmb - benchmark kernel: four LDAPR loads, each surrounded by
 * `dmb ishld`, whose results are summed.
 *
 * Same data flow as loadRcpc: the pointer arguments are read from x0-x3
 * (a, b, c, d), the loads are issued in the order d, c, b, a, x4-x7 are
 * scratch, and the sum *a + *b + *c + *d is returned in x0. The difference
 * is a `dmb ishld` barrier before the first load, between each pair of
 * loads, and after the last load (five barriers in total).
 *
 * LDAPR is only available on hardware with RCpc (see the banner printed by
 * main(), which also gates this kernel on HWCAP_LRCPC under Linux).
 */
uint64_t loadRcpcDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadRcpcDmb \n"
".globl loadRcpcDmb \n"
".align 4\n"
"_loadRcpcDmb:\n"
"loadRcpcDmb:\n"
"dmb ishld \n"
"ldapr x4, [x3] \n" /* x4 = *d */
"dmb ishld \n"
"ldapr x5, [x2] \n" /* x5 = *c */
"dmb ishld \n"
"ldapr x6, [x1] \n" /* x6 = *b */
"dmb ishld \n"
"ldapr x7, [x0] \n" /* x7 = *a */
"dmb ishld \n"
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
/*
 * loadAcquire - benchmark kernel: four LDAR loads whose results are summed.
 *
 * Implemented as top-level assembly so the exact instruction sequence is
 * fixed and not subject to compiler scheduling.
 *
 * The code reads the four pointer arguments from x0-x3 (a, b, c, d in that
 * order under the AArch64 procedure call standard), loads one 64-bit value
 * through each of them in the order d, c, b, a, and returns the sum
 * *a + *b + *c + *d in x0. x4-x7 are used as scratch registers.
 */
uint64_t loadAcquire(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadAcquire \n"
".globl loadAcquire \n"
".align 4\n"
"_loadAcquire:\n"
"loadAcquire:\n"
"ldar x4, [x3] \n" /* x4 = *d */
"ldar x5, [x2] \n" /* x5 = *c */
"ldar x6, [x1] \n" /* x6 = *b */
"ldar x7, [x0] \n" /* x7 = *a */
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
/*
 * loadAcquireDmb - benchmark kernel: four LDAR loads, each surrounded by
 * `dmb ishld`, whose results are summed.
 *
 * Same data flow as loadAcquire: the pointer arguments are read from x0-x3
 * (a, b, c, d), the loads are issued in the order d, c, b, a, x4-x7 are
 * scratch, and the sum *a + *b + *c + *d is returned in x0. The difference
 * is a `dmb ishld` barrier before the first load, between each pair of
 * loads, and after the last load (five barriers in total).
 */
uint64_t loadAcquireDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadAcquireDmb \n"
".globl loadAcquireDmb \n"
".align 4\n"
"_loadAcquireDmb:\n"
"loadAcquireDmb:\n"
"dmb ishld \n"
"ldar x4, [x3] \n" /* x4 = *d */
"dmb ishld \n"
"ldar x5, [x2] \n" /* x5 = *c */
"dmb ishld \n"
"ldar x6, [x1] \n" /* x6 = *b */
"dmb ishld \n"
"ldar x7, [x0] \n" /* x7 = *a */
"dmb ishld \n"
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
/*
 * loadRegular - baseline benchmark kernel: four plain LDR loads whose
 * results are summed.
 *
 * Implemented as top-level assembly so the exact instruction sequence is
 * fixed and not subject to compiler scheduling.
 *
 * The code reads the four pointer arguments from x0-x3 (a, b, c, d in that
 * order under the AArch64 procedure call standard), loads one 64-bit value
 * through each of them in the order d, c, b, a, and returns the sum
 * *a + *b + *c + *d in x0. x4-x7 are used as scratch registers.
 */
uint64_t loadRegular(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadRegular \n"
".globl loadRegular \n"
".align 4\n"
"_loadRegular:\n"
"loadRegular:\n"
"ldr x4, [x3] \n" /* x4 = *d */
"ldr x5, [x2] \n" /* x5 = *c */
"ldr x6, [x1] \n" /* x6 = *b */
"ldr x7, [x0] \n" /* x7 = *a */
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
/*
 * loadRegularDmb - benchmark kernel: four plain LDR loads, each surrounded
 * by `dmb ishld`, whose results are summed.
 *
 * Same data flow as loadRegular: the pointer arguments are read from x0-x3
 * (a, b, c, d), the loads are issued in the order d, c, b, a, x4-x7 are
 * scratch, and the sum *a + *b + *c + *d is returned in x0. The difference
 * is a `dmb ishld` barrier before the first load, between each pair of
 * loads, and after the last load (five barriers in total).
 */
uint64_t loadRegularDmb(uint64_t* a, uint64_t* b, uint64_t* c, uint64_t* d);
asm(
/* Export the symbol both with and without a leading underscore; presumably
 * so the C prototype resolves on underscore-prefixing and non-prefixing
 * object formats alike. */
".globl _loadRegularDmb \n"
".globl loadRegularDmb \n"
".align 4\n"
"_loadRegularDmb:\n"
"loadRegularDmb:\n"
"dmb ishld \n"
"ldr x4, [x3] \n" /* x4 = *d */
"dmb ishld \n"
"ldr x5, [x2] \n" /* x5 = *c */
"dmb ishld \n"
"ldr x6, [x1] \n" /* x6 = *b */
"dmb ishld \n"
"ldr x7, [x0] \n" /* x7 = *a */
"dmb ishld \n"
"add x0, x4, x5 \n" /* accumulate the four loaded values into x0 */
"add x0, x0, x6 \n"
"add x0, x0, x7 \n"
"ret");
#ifndef __APPLE__
/* The only clock id this shim accepts. */
#define CLOCK_UPTIME_RAW 0
/*
 * clock_gettime_nsec_np - stand-in for the Apple function of the same name
 * on platforms that do not provide it.
 *
 * a: clock id; only CLOCK_UPTIME_RAW (0) is accepted, and it is served from
 *    CLOCK_MONOTONIC.
 *
 * Returns the clock reading in nanoseconds. Returns UINT64_MAX (the value
 * the previous `return -1` converted to) for an unsupported clock id or
 * when clock_gettime() fails.
 */
uint64_t clock_gettime_nsec_np(int a) {
    if (a != CLOCK_UPTIME_RAW)
        return UINT64_MAX; // invalid clock type

    struct timespec now;
    int res = clock_gettime(CLOCK_MONOTONIC, &now);

    // The condition must test `res`: asserting on a bare string literal is
    // always true and therefore never fires.
    assert(res == 0 && "Error in clock_gettime");
    if (res != 0)
        return UINT64_MAX; // with NDEBUG: never read the unset timespec

    // Convert in unsigned 64-bit so the multiply does not depend on the
    // width or signedness of time_t.
    return (uint64_t)now.tv_sec * UINT64_C(1000000000) + (uint64_t)now.tv_nsec;
}
#endif
/*
 * RUN_LOAD_BENCH - time one load kernel and print the result.
 *
 * Calls `fn(&a, &b, &c, &d)` up to `iterations` times, stopping early if a
 * call returns anything other than `total`, then prints the elapsed time as
 * "<label>: <ns> ns". An early stop is reported on stderr and recorded in
 * `status`, because the printed time then covers only part of the run.
 *
 * This is a macro rather than a helper taking a function pointer so that
 * the kernel name is substituted textually and the timed loop contains the
 * same call expression a hand-written loop would. It relies on the locals
 * `a`, `b`, `c`, `d`, `total`, `iterations` and `status` of main().
 * `label` must be a string literal (it is concatenated with the format).
 */
#define RUN_LOAD_BENCH(label, fn)                                             \
    do {                                                                      \
        uint64_t bench_iter = 0;                                              \
        const uint64_t bench_start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); \
        for (; bench_iter < iterations; bench_iter++) {                       \
            if (fn(&a, &b, &c, &d) != total)                                  \
                break;                                                        \
        }                                                                     \
        const uint64_t bench_ns =                                             \
            clock_gettime_nsec_np(CLOCK_UPTIME_RAW) - bench_start;            \
        printf(label ": %llu ns\n", (unsigned long long)bench_ns);            \
        if (bench_iter != iterations) {                                       \
            fprintf(stderr,                                                   \
                    "%s: unexpected result at iteration %llu, "               \
                    "timing above is not valid\n",                            \
                    #fn, (unsigned long long)bench_iter);                     \
            status = 1;                                                       \
        }                                                                     \
    } while (0)

/*
 * Benchmark driver.
 *
 * Prints a description of the test, then times each load kernel over
 * INT_MAX / 4 calls (the "512m" of the banner) and prints one line per
 * kernel. Under Linux the two LDAPR kernels are skipped when AT_HWCAP does
 * not advertise HWCAP_LRCPC; on other systems no such check is made and
 * they always run.
 *
 * Returns 0 when every kernel returned the expected sum on every call,
 * 1 otherwise.
 */
int main (int argc, char **argv) {
    (void)argc;
    (void)argv;

    uint64_t a = 2, b = 4, c = 6, d = 8;
    const uint64_t total = a + b + c + d;
    const uint64_t iterations = INT_MAX / 4;
    int status = 0;

    printf("This testcase does test the performance of load instructions.\n\n");
    printf("A set of loops of 512m iterations which do:\n");
    printf("\n4x LDR + addition of the loaded value, which is returned and then checked.\n");
    printf("\n4x LDR, with each one surrounded by dmb ishld + addition of the loaded value\nwhich is returned and then checked.\n");
    printf("\nOn hardware with RCpc only: 4x LDAPR + addition of the loaded value, which is\nreturned and then checked.\n");
    printf("\nOn hardware with RCpc only: 4x LDAPR, with each one surrounded by dmb ishld +\naddition of the loaded value which is returned and then checked.\n");
    printf("\n4x LDAR + addition of the loaded value, which is returned and then checked.\n");
    printf("\n4x LDAR, with each one surrounded by dmb ishld + addition of the loaded value\nwhich is returned and then checked.\n");
    printf("\nNow testing.\n");
    fflush(stdout);

    RUN_LOAD_BENCH("(DMB ISHLD between reads: No ) LDR ", loadRegular);
    RUN_LOAD_BENCH("(DMB ISHLD between reads: Yes) LDR ", loadRegularDmb);

    // LDAPR is not implemented everywhere; only Linux gives us a way to ask.
    int have_rcpc = 1;
#ifdef __linux__
    have_rcpc = (getauxval(AT_HWCAP) & HWCAP_LRCPC) != 0;
#endif
    if (have_rcpc) {
        RUN_LOAD_BENCH("(DMB ISHLD between reads: No ) LDAPR", loadRcpc);
        RUN_LOAD_BENCH("(DMB ISHLD between reads: Yes) LDAPR", loadRcpcDmb);
    }

    RUN_LOAD_BENCH("(DMB ISHLD between reads: No ) LDAR ", loadAcquire);
    RUN_LOAD_BENCH("(DMB ISHLD between reads: Yes) LDAR ", loadAcquireDmb);

    return status;
}
#undef RUN_LOAD_BENCH
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment