Skip to content

Instantly share code, notes, and snippets.

@mmozeiko
Last active March 5, 2025 21:00
Show Gist options
  • Save mmozeiko/98bb947fb5a9d5b8a695adf503308a58 to your computer and use it in GitHub Desktop.
Save mmozeiko/98bb947fb5a9d5b8a695adf503308a58 to your computer and use it in GitHub Desktop.
armv8 timer & cycle counter
#pragma once
#if defined(__linux__)
# define _GNU_SOURCE
# include <sched.h>
# include <unistd.h>
# include <sys/syscall.h>
# include <linux/perf_event.h>
#elif defined(_WIN32)
# include <intrin.h>
# include <windows.h>
#else
# error not supported
#endif
#include <stdint.h>
#include <stdbool.h>
//
// fixed frequency counter, always available
//
// Returns the current value of the virtual counter register (CNTVCT_EL0).
// Divide a tick difference by armv8_cntfrq() to convert it to seconds.
static inline uint64_t armv8_cntvct(void)
{
#if defined(__linux__)
    uint64_t ticks;
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(ticks));
    return ticks;
#elif defined(_WIN32)
    // register encoding taken from "Accessing CNTVCT_EL0" in
    // https://developer.arm.com/documentation/ddi0601/latest/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-Register
    return _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#endif
}
// Returns the value of the counter frequency register (CNTFRQ_EL0),
// used by callers as the tick rate of armv8_cntvct() in Hz.
static inline uint64_t armv8_cntfrq(void)
{
#if defined(__linux__)
    uint64_t hz;
    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(hz));
    return hz;
#elif defined(_WIN32)
    // register encoding taken from "Accessing CNTFRQ_EL0" in
    // https://developer.arm.com/documentation/ddi0601/latest/AArch64-Registers/CNTFRQ-EL0--Counter-timer-Frequency-Register
    return _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 0));
#endif
}
//
// performance monitor cycle counter, pinned to one core
// on Linux requires extra setup - read comments below
//
#if defined(__linux__)
// perf event file descriptor stored by armv8_tsc_init() on success and closed by armv8_tsc_done()
static int armv8_perf_fd;
// affinity mask read with sched_getaffinity() in armv8_tsc_init() before pinning; restored by armv8_tsc_done()
static cpu_set_t armv8_perf_mask;
#elif defined(_WIN32)
// value returned by SetThreadAffinityMask() in armv8_tsc_init(); passed back to it by armv8_tsc_done()
static DWORD_PTR armv8_thread_mask;
#endif
// Pins the calling thread to the core it is currently running on and prepares
// the cycle counter for reads through armv8_pmccntr().
//
// Returns true when the cycle counter can be read from user space; in that
// case call armv8_tsc_done() afterwards to release the perf event and restore
// the previous affinity. Returns false on any failure; the affinity that was
// in effect before the call is put back, so a failed init does not leave the
// thread pinned to a single core.
static inline bool armv8_tsc_init(void)
{
#if defined(__linux__)
    int core = sched_getcpu();
    if (core < 0)
    {
        return false;
    }

    // remember the current affinity so it can be restored later
    if (sched_getaffinity(0, sizeof(cpu_set_t), &armv8_perf_mask) != 0)
    {
        return false;
    }

    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(core, &set);
    if (sched_setaffinity(0, sizeof(cpu_set_t), &set) != 0)
    {
        // pinning failed, affinity is unchanged so there is nothing to undo
        return false;
    }

    struct perf_event_attr attr =
    {
        .size = sizeof(attr),
        .type = PERF_TYPE_HARDWARE,
        .config = PERF_COUNT_HW_CPU_CYCLES,
        .config1 = 1 | 2, // 1=64-bit counters, 2=allow user access
        .pinned = 1,
    };

    bool ok = false;
    int fd = (int)syscall(__NR_perf_event_open, &attr, 0, core, -1, 0);
    if (fd >= 0)
    {
        uint64_t value;
        __asm__ __volatile__("mrs %0, pmuserenr_el0" : "=r"(value));
        if (value & 4)
        {
            armv8_perf_fd = fd;
            ok = true;
        }
        else
        {
            // PMU not allowed for user-space access, to allow run this:
            //   echo 1 | sudo tee /proc/sys/kernel/perf_user_access
            close(fd);
        }
    }
    // else: perf not enabled in kernel, or perf requires root
    // to allow non-root access, run the following:
    //   echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid

    if (!ok)
    {
        // undo the pinning done above, the caller will not call armv8_tsc_done()
        sched_setaffinity(0, sizeof(cpu_set_t), &armv8_perf_mask);
    }
    return ok;
#elif defined(_WIN32)
    // SetThreadAffinityMask returns the previous mask, or 0 when it fails
    DWORD_PTR previous = SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)1 << GetCurrentProcessorNumber());
    if (previous == 0)
    {
        return false;
    }
    armv8_thread_mask = previous;
    // no setup needed to access cycle counter from user-space
    // https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#cycle-counter
    return true;
#else
    return false;
#endif
}
// Counterpart of a successful armv8_tsc_init(): closes the perf event file
// descriptor (Linux only) and puts the saved affinity mask back.
static inline void armv8_tsc_done(void)
{
#if defined(__linux__)
    const int perf_fd = armv8_perf_fd;
    close(perf_fd);
    sched_setaffinity(0, sizeof armv8_perf_mask, &armv8_perf_mask);
#elif defined(_WIN32)
    HANDLE self = GetCurrentThread();
    SetThreadAffinityMask(self, armv8_thread_mask);
#endif
}
// Returns the current value of the performance monitor cycle counter (PMCCNTR_EL0).
// use only when armv8_tsc_init() returned true, otherwise SIGILL will be raised
static inline uint64_t armv8_pmccntr(void)
{
#if defined(__linux__)
    uint64_t cycles;
    __asm__ __volatile__("mrs %0, pmccntr_el0" : "=r"(cycles));
    return cycles;
#elif defined(_WIN32)
    // register encoding taken from "Accessing PMCCNTR_EL0" in
    // https://developer.arm.com/documentation/ddi0601/latest/AArch64-Registers/PMCCNTR-EL0--Performance-Monitors-Cycle-Count-Register
    return _ReadStatusReg(ARM64_SYSREG(3, 3, 9, 13, 0));
#endif
}
#include "armv8_tsc.h"
#include <stdio.h>
// Workload that main() times: one million iterations of a no-op body.
static void loop()
{
    int remaining = 1000000;
    while (remaining > 0)
    {
#if defined(_MSC_VER)
        __nop();
#else
        // empty volatile asm statement; presumably here so the compiler keeps the loop
        __asm__ __volatile__("");
#endif
        remaining--;
    }
}
// Times loop() twice: first with the fixed-frequency counter, then, when
// armv8_tsc_init() succeeds, with the cycle counter.
int main()
{
    // fixed-frequency counter measurement
    uint64_t ticks0 = armv8_cntvct();
    loop();
    uint64_t ticks1 = armv8_cntvct();
    uint64_t freq = armv8_cntfrq();

    uint64_t elapsed_ticks = ticks1 - ticks0;
    double elapsed_msec = elapsed_ticks * 1000.0 / freq;
    printf("cntvct : %zu ticks @ %zu MHz = %.2f msec\n",
           (size_t)elapsed_ticks,
           (size_t)(freq / 1000000),
           elapsed_msec);

    // cycle counter measurement, only possible after a successful init
    if (!armv8_tsc_init())
    {
        printf("perf not available! not enough privileges?\n");
        return 0;
    }

    uint64_t cycles0 = armv8_pmccntr();
    loop();
    uint64_t cycles1 = armv8_pmccntr();
    uint64_t elapsed_cycles = cycles1 - cycles0;
    printf("pmccntr: %zu cycles\n", (size_t)elapsed_cycles);

    armv8_tsc_done();
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment