Last active
November 3, 2024 17:07
-
-
Save raindev/232f8b782134fe9215eaa600f8527d27 to your computer and use it in GitHub Desktop.
Static memory alignment benchmark on Pi 4 (Arm 64)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h>
#include <inttypes.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
//#include <x86intrin.h>
/* Number of elements in the benchmark array. */
#define COUNT 4096

/*
 * Benchmark working set: COUNT records laid out with no padding.
 *
 * With the packed attribute each element is 4 + 1 + 2 = 7 bytes, so
 * neighbouring elements (and the `c` field inside each one) do not sit on
 * their natural alignment boundaries.
 *
 * Alternative layout kept for comparison runs -- swap the opening and
 * closing lines for the commented-out ones below to get a naturally
 * aligned struct with a 64-byte alignment request on the array:
 *
 *   //struct {
 *   //} alignas(64) test_mem[COUNT];
 */
struct __attribute__((packed)) {
    uint32_t a;
    uint8_t b;
    uint16_t c;
} test_mem[COUNT];
static inline uint64_t read_cntvct(void) { | |
uint64_t cnt; | |
asm volatile("mrs %0, cntvct_el0" : "=r"(cnt)); | |
return cnt; | |
} | |
//static inline uint64_t read_cycle_counter(void) { | |
// uint64_t value; | |
// asm volatile("mrs %0, pmccntr_el0" : "=r"(value)); | |
// return value; | |
//} | |
int main() { | |
unsigned int cpu; | |
int err = mlockall(MCL_CURRENT|MCL_FUTURE); | |
assert(err == 0); | |
//auto tsc1 = __rdtscp(&cpu); | |
uint64_t tsc1 = read_cntvct(); | |
//uint64_t tsc1 = read_cntvct(); | |
// do memory stuff here | |
long sum = 0; | |
for (int i = 0; i < COUNT; i++) { | |
test_mem[i].a = i % (2<<16); | |
test_mem[i].b = i % (2<<16) + 1; | |
sum += test_mem[i].a - test_mem[i].b; | |
} | |
//auto tsc2 = __rdtscp(&cpu); | |
uint64_t tsc2 = read_cntvct(); | |
//uint64_t tsc2 = read_cntvct(); | |
printf("tsc diff: %ld\n", tsc2 - tsc1); | |
printf("sum: %ld\n", sizeof(test_mem)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ sudo perf stat -r10 ./main | |
tsc diff: 589 | |
sum: 28672 | |
tsc diff: 737 | |
sum: 28672 | |
tsc diff: 620 | |
sum: 28672 | |
tsc diff: 575 | |
sum: 28672 | |
tsc diff: 593 | |
sum: 28672 | |
tsc diff: 591 | |
sum: 28672 | |
tsc diff: 565 | |
sum: 28672 | |
tsc diff: 572 | |
sum: 28672 | |
tsc diff: 526 | |
sum: 28672 | |
tsc diff: 587 | |
sum: 28672 | |
Performance counter stats for './main' (10 runs): | |
3.33 msec task-clock # 0.631 CPUs utilized ( +- 3.00% ) | |
0 context-switches # 0.000 /sec | |
0 cpu-migrations # 0.000 /sec | |
37 page-faults # 11.124 K/sec ( +- 0.70% ) | |
4613519 cycles # 1.387 GHz ( +- 0.60% ) | |
4474784 instructions # 0.97 insn per cycle ( +- 0.32% ) | |
<not supported> branches | |
14479 branch-misses ( +- 1.20% ) | |
0.005268 +- 0.000134 seconds time elapsed ( +- 2.54% ) | |
# perf report --stdio # repeated multiple times | |
# Overhead Command Shared Object Symbol | |
# ........ ....... ................. ............................... | |
# | |
20.55% main [kernel.kallsyms] [k] el0_da | |
20.19% main [kernel.kallsyms] [k] _raw_spin_unlock | |
17.52% main [kernel.kallsyms] [k] arch_local_irq_restore | |
16.29% main [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore | |
14.47% main [kernel.kallsyms] [k] locks_remove_posix | |
10.19% main [kernel.kallsyms] [k] strnlen_user | |
0.77% main [kernel.kallsyms] [k] vma_expand | |
0.03% perf-ex [kernel.kallsyms] [k] perf_event_enable_on_exec | |
# Overhead Command Shared Object Symbol | |
# ........ ....... ................. ............................. | |
# | |
39.82% main [kernel.kallsyms] [k] arch_local_irq_restore | |
25.53% main [kernel.kallsyms] [k] handle_softirqs | |
23.23% main [kernel.kallsyms] [k] percpu_counter_add_batch | |
10.62% main [kernel.kallsyms] [k] strnlen_user | |
0.77% main [kernel.kallsyms] [k] mas_find | |
0.03% perf-ex [kernel.kallsyms] [k] perf_event_enable_on_exec | |
# Overhead Command Shared Object Symbol | |
# ........ ....... ..................... ............................. | |
# | |
31.19% main ld-linux-aarch64.so.1 [.] 0x0000000000008a20 | |
27.40% main [kernel.kallsyms] [k] __pi_clear_page | |
25.11% main [kernel.kallsyms] [k] release_pages | |
15.19% main [kernel.kallsyms] [k] kfree | |
1.06% main [kernel.kallsyms] [k] mas_wr_walk | |
0.05% perf-ex [kernel.kallsyms] [k] perf_event_enable_on_exec | |
# Overhead Command Shared Object Symbol | |
# ........ ....... ................. ............................. | |
# | |
22.85% main [kernel.kallsyms] [k] release_pages | |
22.85% main [kernel.kallsyms] [k] vma_interval_tree_remove | |
19.47% main [kernel.kallsyms] [k] _raw_spin_unlock | |
18.07% main libc.so.6 [.] write | |
16.05% main [kernel.kallsyms] [k] __rcu_read_unlock | |
0.68% main [kernel.kallsyms] [k] get_random_u64 | |
0.03% perf-ex [kernel.kallsyms] [k] perf_event_enable_on_exec | |
# Overhead Command Shared Object Symbol | |
# ........ ....... ..................... ............................. | |
# | |
25.70% main ld-linux-aarch64.so.1 [.] 0x00000000000111d8 | |
23.23% main [kernel.kallsyms] [k] next_uptodate_folio | |
21.42% main [kernel.kallsyms] [k] __pi_clear_page | |
18.60% main [kernel.kallsyms] [k] arch_local_irq_restore | |
10.27% main [kernel.kallsyms] [k] strnlen_user | |
0.74% main [kernel.kallsyms] [k] get_random_u16 | |
0.03% perf-ex [kernel.kallsyms] [k] perf_event_enable_on_exec |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment