|
#pragma once |
|
|
|
#include <stdint.h> |
|
|
|
// |
|
// interface |
|
// |
|
|
|
enum |
|
{ |
|
MP_CycleCount, |
|
MP_Instructions, |
|
MP_BranchMisses, |
|
MP_BranchCount, |
|
MP_DataMisses, |
|
MP_DataAccess, |
|
MP_COUNT, |
|
}; |
|
|
|
typedef struct { |
|
double ElapsedTime; |
|
uint64_t ContextSwitches; |
|
uint64_t Counters[MP_COUNT]; |
|
} MiniPerfResult; |
|
|
|
typedef void MiniPerfFun(void* Arg); |
|
|
|
// IMPORTANT NOTES |
|
// |
|
// == WINDOWS == |
|
// |
|
// * Must run the process as "administrator"
|
// |
|
// * Check available counters with the "wpr -pmcsources" command. If you see only "Timer" there,

//   then Windows ETW does NOT have PMU counters available. On an AMD system you might

//   need to disable Virtualization in the BIOS (be aware that this prevents using WSL2)
|
// |
|
// * ETW is set up to report PMU counters on every context switch. The code calculates deltas

//   between measurements for the target thread, and returns the accumulated values.
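
//

// * Build with WINVER (or _WIN32_WINNT) defined as 0x0602 or higher, otherwise

//   TraceSetInformation is not declared in the evntrace.h header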
|
// |
|
// == LINUX == |
|
// |
|
// * Must run the process as root, for example with "sudo"
|
// |
|
// * Check available counters with "perf list" command. It should show multiple |
|
// [Hardware event] entries |
|
// |
|
// * Make sure the NMI watchdog is disabled, as it uses one PMU counter for itself.

//   To disable the NMI watchdog, run: "echo 0 | sudo tee /proc/sys/kernel/nmi_watchdog"
|
// |
|
// * perf uses only generic PMU counters for generic hardware events. It does not use fixed

//   ones. This means on Skylake+ only 4 events per core will be available. It should be

//   possible to use the 3 fixed ones (cycles, instructions, refcycles) too, but setting them

//   up requires arch-specific register numbers, which is not done here.
|
// |
|
// == APPLE == |
|
// |
|
// * Must run the process as root, for example with "sudo"
|
// |
|
|
|
// runs the function with the given argument and returns measured PMU counter values

// execution of the function is pinned to one CPU core
|
|
|
static MiniPerfResult MiniPerf(MiniPerfFun* Fun, void* Arg); |
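
// Example usage - a minimal sketch; "Work", "WorkArg" and the "miniperf.h" file name are

// hypothetical names for illustration, not part of this header:

//

//     #include <stdio.h>

//     #include "miniperf.h"

//

//     typedef struct { uint64_t Sum; } WorkArg;

//

//     static void Work(void* Arg)

//     {

//         WorkArg* W = (WorkArg*)Arg;

//         for (uint64_t i = 0; i < 100000000; i++) W->Sum += i * i;

//     }

//

//     int main(void)

//     {

//         WorkArg W = { 0 };

//         MiniPerfResult R = MiniPerf(&Work, &W);

//         printf("%.6f sec, %llu cycles, %llu instructions, %llu context switches\n",

//                R.ElapsedTime,

//                (unsigned long long)R.Counters[MP_CycleCount],

//                (unsigned long long)R.Counters[MP_Instructions],

//                (unsigned long long)R.ContextSwitches);

//         return 0;

//     }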
|
|
|
// |
|
// implementation |
|
// |
|
|
|
#if defined(_WIN32) |
|
|
|
#define WIN32_LEAN_AND_MEAN |
|
#include <windows.h> |
|
#include <evntrace.h> |
|
#include <evntcons.h> |
|
|
|
#include <intrin.h> |
|
#define MP_ASSERT(Cond) do { if (!(Cond)) __debugbreak(); } while (0) |
|
|
|
#pragma comment (lib, "advapi32") |
|
|
|
typedef struct |
|
{ |
|
MiniPerfResult Result; |
|
DWORD ThreadId; |
|
DWORD CpuIndex; |
|
ULONG* CountersUsed; |
|
size_t CounterCount; |
|
|
|
MiniPerfFun* Fun; |
|
void* Arg; |
|
|
|
LARGE_INTEGER StartTime; |
|
LARGE_INTEGER EndTime; |
|
} MiniPerfContext; |
|
|
|
static const GUID MP_ThreadGuid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } }; |
|
static const GUID MP_PageFaultGuid = { 0x3d6fa8d3, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } }; |
|
|
|
// Skylake+ can have 4 generic counters + 3 fixed (cycles, instructions, refcycles) |
|
static const LPCWSTR MP_IntelCounters[] = |
|
{ |
|
/* [MP_CycleCount] = */ L"UnhaltedCoreCyclesFixed", |
|
/* [MP_Instructions] = */ L"InstructionsRetiredFixed", |
|
/* [MP_BranchMisses] = */ L"BranchMispredictions", |
|
/* [MP_BranchCount] = */ L"BranchInstructions", |
|
// on Intel can use L3 cache counters |
|
/* [MP_DataMisses] = */ L"LLCMisses", |
|
/* [MP_DataAccess] = */ L"LLCReference", |
|
}; |
|
|
|
// AMD Zen can have 6 generic counters |
|
static const LPCWSTR MP_AmdCounters[] = |
|
{ |
|
/* [MP_CycleCount] */ L"TotalCycles", |
|
/* [MP_Instructions] */ L"TotalIssues", |
|
/* [MP_BranchMisses] */ L"BranchMispredictions", |
|
/* [MP_BranchCount] */ L"BranchInstructions", |
|
// on AMD can use L1 cache counters |
|
/* [MP_DataMisses] */ L"DcacheMisses", |
|
/* [MP_DataAccess] */ L"DcacheAccesses", |
|
}; |
|
|
|
static const LPCWSTR MP_ArmCounters[] = |
|
{ |
|
/* [MP_CycleCount] */ L"TotalCycles", |
|
/* [MP_Instructions] */ L"TotalIssues", |
|
/* [MP_BranchMisses] */ L"BranchMispredictions", |
|
/* [MP_BranchCount] */ L"BranchInstructions", |
|
/* [MP_DataMisses] */ L"DcacheMisses", |
|
/* [MP_DataAccess] */ L"DcacheAccesses", |
|
}; |
|
|
|
static void CALLBACK MiniPerf__Callback(EVENT_RECORD* Event) |
|
{ |
|
const GUID* Provider = &Event->EventHeader.ProviderId; |
|
UCHAR Opcode = Event->EventHeader.EventDescriptor.Opcode; |
|
UCHAR CpuIndex = GetEventProcessorIndex(Event); |
|
MiniPerfContext* Context = (MiniPerfContext*)Event->UserContext; |
|
|
|
if (RtlEqualMemory(Provider, &MP_ThreadGuid, sizeof(MP_ThreadGuid)) && Opcode == 0x24 && CpuIndex == Context->CpuIndex) |
|
{ |
|
MP_ASSERT(Event->UserDataLength >= 24); |
|
DWORD NewThreadId = *(DWORD*)((BYTE*)Event->UserData + 0); |
|
DWORD OldThreadId = *(DWORD*)((BYTE*)Event->UserData + 4); |
|
DWORD ThreadId = Context->ThreadId; |
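
// PMC values delivered with the event are per-core running totals: subtract them when the

// target thread is switched in and add them when it is switched out, so only the intervals

// where the target thread ran on this core accumulate into the result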
|
|
|
for (size_t i = 0; i < Event->ExtendedDataCount; i++) |
|
{ |
|
EVENT_HEADER_EXTENDED_DATA_ITEM* Item = Event->ExtendedData + i; |
|
if (Item->ExtType == EVENT_HEADER_EXT_TYPE_PMC_COUNTERS) |
|
{ |
|
MP_ASSERT(Item->DataSize == sizeof(ULONG64) * Context->CounterCount); |
|
|
|
EVENT_EXTENDED_ITEM_PMC_COUNTERS* Pmc = (EVENT_EXTENDED_ITEM_PMC_COUNTERS*)Item->DataPtr; |
|
for (size_t c = 0; c < Item->DataSize / sizeof(ULONG64); c++) |
|
{ |
|
size_t Counter = Context->CountersUsed[c]; |
|
Context->Result.Counters[Counter] -= (NewThreadId == ThreadId) ? Pmc->Counter[c] : 0; |
|
Context->Result.Counters[Counter] += (OldThreadId == ThreadId) ? Pmc->Counter[c] : 0; |
|
} |
|
} |
|
} |
|
|
|
Context->Result.ContextSwitches += (OldThreadId == ThreadId); |
|
} |
|
} |
|
|
|
static DWORD CALLBACK MiniPerf__ProcessThread(LPVOID Arg) |
|
{ |
|
TRACEHANDLE Session = (TRACEHANDLE)Arg; |
|
ProcessTrace(&Session, 1, NULL, NULL); |
|
return 0; |
|
} |
|
|
|
static DWORD CALLBACK MiniPerf__FunThread(LPVOID Arg) |
|
{ |
|
MiniPerfContext* Context = (MiniPerfContext*)Arg; |
|
QueryPerformanceCounter(&Context->StartTime); |
|
Context->Fun(Context->Arg); |
|
QueryPerformanceCounter(&Context->EndTime); |
|
return 0; |
|
} |
|
|
|
MiniPerfResult MiniPerf(MiniPerfFun* Fun, void* Arg) |
|
{ |
|
ULONG Status; |
|
|
|
MiniPerfContext Context; |
|
ZeroMemory(&Context, sizeof(Context)); |
|
|
|
// find PMU counters by looking up available names |
|
static ULONG CounterSources[MP_COUNT]; |
|
static ULONG CountersUsed[MP_COUNT]; |
|
static size_t CounterCount = 0; |
|
|
|
if (CounterCount == 0) |
|
{ |
|
#if defined(_M_AMD64) |
|
int CpuName[4]; |
|
__cpuid(CpuName, 0); |
|
|
|
const LPCWSTR* CounterNames; |
|
if (CpuName[2] == 0x6c65746e) // GenuineI[ntel] |
|
{ |
|
CounterNames = MP_IntelCounters; |
|
} |
|
else if (CpuName[2] == 0x444d4163) // Authenti[cAMD] |
|
{ |
|
CounterNames = MP_AmdCounters; |
|
} |
|
else |
|
{ |
|
MP_ASSERT(!"Unknown CPU"); |
|
return Context.Result; |
|
} |
|
#elif defined(_M_ARM64) |
|
const LPCWSTR* CounterNames = MP_ArmCounters; |
|
#else |
|
# error Unknown architecture |
|
#endif |
|
|
|
ULONG BufferSize; |
|
|
|
// how much memory needed to query PMU counter names |
|
Status = TraceQueryInformation(0, TraceProfileSourceListInfo, NULL, 0, &BufferSize); |
|
MP_ASSERT(Status == ERROR_BAD_LENGTH); |
|
|
|
BYTE* Buffer = (BYTE*)HeapAlloc(GetProcessHeap(), 0, BufferSize); |
|
MP_ASSERT(Buffer); |
|
|
|
// get PMU counter names |
|
Status = TraceQueryInformation(0, TraceProfileSourceListInfo, Buffer, BufferSize, &BufferSize); |
|
MP_ASSERT(Status == ERROR_SUCCESS); |
|
|
|
size_t Offset = 0; |
|
for (;;) |
|
{ |
|
PROFILE_SOURCE_INFO* Info = (PROFILE_SOURCE_INFO*)(Buffer + Offset); |
|
|
|
for (size_t i = 0; i < MP_COUNT; i++) |
|
{ |
|
if (lstrcmpW(Info->Description, CounterNames[i]) == 0) |
|
{ |
|
CounterSources[CounterCount] = Info->Source; |
|
CountersUsed[CounterCount++] = i; |
|
break; |
|
} |
|
} |
|
|
|
if (Info->NextEntryOffset == 0) |
|
{ |
|
break; |
|
} |
|
Offset += Info->NextEntryOffset; |
|
} |
|
|
|
HeapFree(GetProcessHeap(), 0, Buffer); |
|
} |
|
Context.CountersUsed = CountersUsed; |
|
Context.CounterCount = CounterCount; |
|
Context.Fun = Fun; |
|
Context.Arg = Arg; |
|
|
|
struct |
|
{ |
|
EVENT_TRACE_PROPERTIES_V2 Properties; |
|
WCHAR Name[1024]; |
|
} Trace; |
|
|
|
const WCHAR TraceName[] = L"MiniPerf"; |
|
|
|
EVENT_TRACE_PROPERTIES_V2* Properties = &Trace.Properties; |
|
|
|
// stop existing trace in case it is already running |
|
ZeroMemory(&Trace, sizeof(Trace)); |
|
Properties->Wnode.BufferSize = sizeof(Trace); |
|
Properties->LoggerNameOffset = sizeof(Trace.Properties); |
|
|
|
Status = ControlTraceW(0, TraceName, (EVENT_TRACE_PROPERTIES*)Properties, EVENT_TRACE_CONTROL_STOP); |
|
MP_ASSERT(Status == ERROR_SUCCESS || Status == ERROR_MORE_DATA || Status == ERROR_WMI_INSTANCE_NOT_FOUND); |
|
|
|
// start a new trace, capture context switches |
|
ZeroMemory(&Trace, sizeof(Trace)); |
|
Properties->Wnode.BufferSize = sizeof(Trace); |
|
Properties->Wnode.ClientContext = 3; // timestamp clock type: 1 = QPC, 2 = system time, 3 = raw CPU cycle counter
|
Properties->Wnode.Flags = WNODE_FLAG_TRACED_GUID | WNODE_FLAG_VERSIONED_PROPERTIES; |
|
Properties->LogFileMode = EVENT_TRACE_REAL_TIME_MODE | EVENT_TRACE_SYSTEM_LOGGER_MODE; |
|
Properties->VersionNumber = 2; |
|
Properties->EnableFlags = EVENT_TRACE_FLAG_CSWITCH; |
|
Properties->LoggerNameOffset = sizeof(Trace.Properties); |
|
|
|
TRACEHANDLE TraceHandle; |
|
Status = StartTraceW(&TraceHandle, TraceName, (EVENT_TRACE_PROPERTIES*)Properties); |
|
if (Status != ERROR_SUCCESS) |
|
{ |
|
// ERROR_ACCESS_DENIED -> need to run with admin privileges |
|
// ERROR_NO_SYSTEM_RESOURCES -> too many system traces already running |
|
|
|
// just run the function directly; only elapsed time will be measured
|
MiniPerf__FunThread(&Context); |
|
} |
|
else |
|
{ |
|
// enable PMU counters if there are any (otherwise only context switch count will be captured) |
|
if (CounterCount != 0) |
|
{ |
|
Status = TraceSetInformation(TraceHandle, TracePmcCounterListInfo, CounterSources, (ULONG)(CounterCount * sizeof(CounterSources[0])));
|
// if this fails with ERROR_BUSY (0xAA), most likely someone else in the system is collecting

// PMU counters, and it is not clear how (or whether at all) you can forcefully stop or

// reconfigure that. Rebooting helps.
|
MP_ASSERT(Status == ERROR_SUCCESS); |
|
|
|
// collect PMU counters on context switch event |
|
CLASSIC_EVENT_ID EventId = { MP_ThreadGuid, 0x24 }; |
|
Status = TraceSetInformation(TraceHandle, TracePmcEventListInfo, &EventId, sizeof(EventId)); |
|
MP_ASSERT(Status == ERROR_SUCCESS); |
|
} |
|
|
|
EVENT_TRACE_LOGFILEW Log; |
|
ZeroMemory(&Log, sizeof(Log)); |
|
Log.LoggerName = Trace.Name; |
|
Log.EventRecordCallback = &MiniPerf__Callback; |
|
Log.ProcessTraceMode = PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP | PROCESS_TRACE_MODE_REAL_TIME; |
|
Log.Context = &Context; |
|
|
|
// open trace for processing incoming events |
|
TRACEHANDLE Session = OpenTraceW(&Log); |
|
MP_ASSERT(Session != INVALID_PROCESSTRACE_HANDLE); |
|
|
|
// start ETW processing thread |
|
HANDLE ProcessingThread = CreateThread(NULL, 0, &MiniPerf__ProcessThread, (LPVOID)Session, 0, NULL); |
|
MP_ASSERT(ProcessingThread); |
|
|
|
// execute target function |
|
// it runs on a separate thread, so there is a context switch right at the start of execution that captures the initial PMU counter values
|
{ |
|
// create the thread suspended so ThreadId is known before it starts running
|
HANDLE FunThread = CreateThread(NULL, 0, &MiniPerf__FunThread, &Context, CREATE_SUSPENDED, &Context.ThreadId); |
|
MP_ASSERT(FunThread); |
|
|
|
// pin thread to one CPU core |
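
// (SetThreadIdealProcessor with MAXIMUM_PROCESSORS returns the current ideal processor without changing it)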
|
Context.CpuIndex = SetThreadIdealProcessor(FunThread, MAXIMUM_PROCESSORS); |
|
SetThreadAffinityMask(FunThread, 1ULL << Context.CpuIndex); |
|
|
|
// now allow thread to run, thus force context switch for target thread |
|
ResumeThread(FunThread); |
|
|
|
WaitForSingleObject(FunThread, INFINITE); |
|
CloseHandle(FunThread); |
|
} |
|
|
|
// stop producing new events |
|
Status = ControlTraceW(TraceHandle, NULL, (EVENT_TRACE_PROPERTIES*)Properties, EVENT_TRACE_CONTROL_STOP); |
|
MP_ASSERT(Status == ERROR_SUCCESS); |
|
|
|
// close trace processing; this makes ETW process all the pending events in its buffers
|
Status = CloseTrace(Session); |
|
MP_ASSERT(Status == ERROR_SUCCESS || Status == ERROR_CTX_CLOSE_PENDING); |
|
|
|
// wait until ETW processing thread finishes with callbacks |
|
WaitForSingleObject(ProcessingThread, INFINITE); |
|
CloseHandle(ProcessingThread); |
|
} |
|
|
|
LARGE_INTEGER Freq; |
|
QueryPerformanceFrequency(&Freq); |
|
Context.Result.ElapsedTime = (double)(Context.EndTime.QuadPart - Context.StartTime.QuadPart) / Freq.QuadPart; |
|
|
|
return Context.Result; |
|
} |
|
|
|
#elif defined(__linux__) |
|
|
|
#include <time.h> |
|
#if defined(__x86_64__) |
|
# include <cpuid.h> |
|
#endif |
|
#include <sched.h> |
|
#include <unistd.h> |
|
#include <sys/ioctl.h> |
|
#include <sys/syscall.h> |
|
#include <linux/perf_event.h> |
|
#include <assert.h> |
|
|
|
MiniPerfResult MiniPerf(MiniPerfFun* Fun, void* Arg) |
|
{ |
|
MiniPerfResult Result = { 0 }; |
|
|
|
int CounterCount = 0; |
|
#if defined(__x86_64__) |
|
{ |
|
int eax, ebx, ecx, edx; |
|
__cpuid(0, eax, ebx, ecx, edx); |
|
|
|
if (ecx == signature_INTEL_ecx) |
|
{ |
|
__cpuid(0xa, eax, ebx, ecx, edx); |
|
CounterCount = (eax >> 8) & 0xff; |
|
} |
|
else if (ecx == signature_AMD_ecx) |
|
{ |
|
__cpuid(0x80000001, eax, ebx, ecx, edx); |
|
CounterCount = ((ecx >> 23) & 1) ? 6 : 4; // PerfCtrExtCore (ECX bit 23): 6 extended core counters, else 4 legacy ones
|
} |
|
else |
|
{ |
|
assert(!"Unknown CPU"); |
|
return Result; |
|
} |
|
} |
|
#else |
|
CounterCount = MP_COUNT; // TODO: is it possible to get this value on armv8 at runtime? |
|
#endif |
|
|
|
static const uint32_t PerfConfig[MP_COUNT][2] = |
|
{ |
|
[MP_CycleCount] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES }, |
|
[MP_Instructions] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS }, |
|
[MP_BranchMisses] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES }, |
|
[MP_BranchCount] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, |
|
[MP_DataMisses] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES }, |
|
[MP_DataAccess] = { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES }, |
|
// [MP_DataMisses] = { PERF_TYPE_HW_CACHE, (PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) }, |
|
// [MP_DataAccess] = { PERF_TYPE_HW_CACHE, (PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)) }, |
|
}; |
|
|
|
// capture up to MP_COUNT counters |
|
if (CounterCount > MP_COUNT) |
|
{ |
|
CounterCount = MP_COUNT; |
|
} |
|
|
|
int PerfFile[MP_COUNT + 1] = { 0 }; |
|
PerfFile[0] = -1; |
|
|
|
// query index of current CPU core |
|
int CpuIndex = sched_getcpu(); |
|
|
|
// pin current thread to CPU core |
|
cpu_set_t CpuMask, CpuMaskOld; |
|
CPU_ZERO(&CpuMask); |
|
CPU_SET(CpuIndex, &CpuMask); |
|
sched_getaffinity(0, sizeof(CpuMaskOld), &CpuMaskOld); |
|
sched_setaffinity(0, sizeof(CpuMask), &CpuMask); |
|
|
|
int SetupFailed = 0; |
|
|
|
// perf syscall setup |
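
// PerfFile[0] starts as -1, so the first perf_event_open() creates a new group leader;

// the following events pass the leader fd and join its group, so they are enabled, disabled, and read together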
|
for (int i=0; i<CounterCount; i++) |
|
{ |
|
struct perf_event_attr PerfAttr = { 0 }; |
|
PerfAttr.type = PerfConfig[i][0]; |
|
PerfAttr.size = sizeof(PerfAttr); |
|
PerfAttr.config = PerfConfig[i][1]; |
|
PerfAttr.disabled = 1; |
|
PerfAttr.pinned = i == 0; |
|
PerfAttr.read_format = PERF_FORMAT_GROUP; |
|
|
|
PerfFile[i] = syscall(SYS_perf_event_open, &PerfAttr, 0, CpuIndex, PerfFile[0], 0); |
|
if (PerfFile[i] < 0) |
|
{ |
|
// errno == EACCES - no permissions |
|
// errno == ENOENT - counter not available |
|
SetupFailed = 1; |
|
break; |
|
} |
|
} |
|
|
|
if (!SetupFailed) |
|
{ |
|
// also collect context switches |
|
struct perf_event_attr PerfAttr = { 0 }; |
|
PerfAttr.type = PERF_TYPE_SOFTWARE; |
|
PerfAttr.size = sizeof(PerfAttr); |
|
PerfAttr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; |
|
PerfAttr.disabled = 1; |
|
PerfAttr.read_format = PERF_FORMAT_GROUP; |
|
|
|
PerfFile[CounterCount] = syscall(SYS_perf_event_open, &PerfAttr, 0, CpuIndex, PerfFile[0], 0); |
|
} |
|
|
|
struct timespec TimeStart; |
|
struct timespec TimeEnd; |
|
|
|
if (!SetupFailed) |
|
{ |
|
ioctl(PerfFile[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); |
|
} |
|
|
|
clock_gettime(CLOCK_MONOTONIC_RAW, &TimeStart); |
|
Fun(Arg); |
|
clock_gettime(CLOCK_MONOTONIC_RAW, &TimeEnd); |
|
|
|
if (!SetupFailed) |
|
{ |
|
ioctl(PerfFile[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); |
|
} |
|
|
|
// restore CPU affinity |
|
sched_setaffinity(0, sizeof(CpuMaskOld), &CpuMaskOld); |
|
|
|
// read counter values |
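
// with PERF_FORMAT_GROUP a single read() returns { nr, value[0], ..., value[nr-1] }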
|
uint64_t Values[1+MP_COUNT+1]; |
|
|
|
// if this read fails, most likely you have not disabled the NMI watchdog, which means

// perf was not able to set up all the PMU counters - pinned=1 on the group leader means

// counter values can be read only when all counters in the group could be scheduled

// (a read from a pinned group in error state returns 0, not -1, hence the size check)

ssize_t ExpectedSize = (ssize_t)((1 + CounterCount + 1) * sizeof(uint64_t)); // nr field + hw counters + context-switch counter

if (ExpectedSize == read(PerfFile[0], Values, sizeof(Values)))
|
{ |
|
for (int i=0; i<CounterCount; i++) |
|
{ |
|
Result.Counters[i] = Values[1+i]; |
|
} |
|
Result.ContextSwitches = Values[1+CounterCount]; |
|
} |
|
|
|
// done with perf |
|
for (int i=0; i<MP_COUNT+1; i++) |
|
{ |
|
if (PerfFile[i] > 0) |
|
{ |
|
close(PerfFile[i]); |
|
} |
|
} |
|
|
|
Result.ElapsedTime = (TimeEnd.tv_sec - TimeStart.tv_sec) + 1e-9 * (TimeEnd.tv_nsec - TimeStart.tv_nsec); |
|
|
|
return Result; |
|
} |
|
|
|
#elif defined(__APPLE__) |
|
|
|
#include <stdint.h> |
|
#include <stdlib.h> |
|
#include <dlfcn.h> |
|
#include <assert.h> |
|
#include <pthread.h> |
|
#include <time.h> |
|
#include <sys/resource.h> |
|
|
|
// adapted from https://gist.github.com/ibireme/173517c208c7dc333ba962c1f0d67d12
|
|
|
typedef struct kpep_db kpep_db; |
|
typedef struct kpep_event kpep_event; |
|
typedef struct kpep_config kpep_config; |
|
typedef uint64_t kpc_config_t; |
|
|
|
#define KPC_MAX_COUNTERS 32 |
|
#define KPC_CLASS_CONFIGURABLE (1) |
|
#define KPC_CLASS_CONFIGURABLE_MASK (1U << KPC_CLASS_CONFIGURABLE) |
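

// X-macro lists of the private kperf/kperfdata functions used below; each entry expands

// once into a function pointer declaration and once into a dlsym() lookup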
|
|
|
#define KPERF_FUNCS(X) \ |
|
X(int, kpc_force_all_ctrs_set, int value) \ |
|
X(int, kpc_set_config, uint32_t classes, kpc_config_t* config) \ |
|
X(int, kpc_set_counting, uint32_t classes) \ |
|
X(int, kpc_set_thread_counting, uint32_t classes) \ |
|
X(int, kpc_get_thread_counters, uint32_t tid, uint32_t buf_count, void* buf) \ |
|
|
|
#define KPERFDATA_FUNCS(X) \ |
|
X(int, kpep_db_create, const char *name, kpep_db** db) \ |
|
X(int, kpep_db_events_count, kpep_db* db, size_t* count) \ |
|
X(int, kpep_db_events, kpep_db* db, kpep_event** buf, size_t buf_size) \ |
|
X(int, kpep_db_event, kpep_db* db, const char* name, kpep_event** ev) \ |
|
X(int, kpep_event_name, kpep_event* ev, const char** name) \ |
|
X(int, kpep_event_description, kpep_event* ev, const char** desc) \ |
|
X(void, kpep_db_free, kpep_db* db) \ |
|
X(int, kpep_config_create, kpep_db* db, kpep_config** config) \ |
|
X(int, kpep_config_force_counters, kpep_config* cfg) \ |
|
X(int, kpep_config_add_event, kpep_config* cfg, kpep_event** ev, uint32_t flag, uint32_t* err) \ |
|
X(int, kpep_config_kpc_classes, kpep_config* cfg, uint32_t* classes) \ |
|
X(int, kpep_config_kpc_count, kpep_config* cfg, size_t* count) \ |
|
X(int, kpep_config_kpc_map, kpep_config* cfg, void* buf, size_t buf_size) \ |
|
X(int, kpep_config_kpc, kpep_config* cfg, kpc_config_t* buf, size_t buf_size) \ |
|
X(void, kpep_config_free, kpep_config *cfg) |
|
|
|
#define X(ret, name, ...) static ret (*name)(__VA_ARGS__); |
|
KPERF_FUNCS(X) |
|
KPERFDATA_FUNCS(X) |
|
#undef X |
|
|
|
MiniPerfResult MiniPerf(MiniPerfFun* Fun, void* Arg) |
|
{ |
|
MiniPerfResult Result = { 0 }; |
|
|
|
static uint32_t CounterClasses; |
|
static size_t CounterRegCount; |
|
static size_t CounterEventCount; // number of events successfully added to the kpep config

static size_t CounterMap[KPC_MAX_COUNTERS];
|
static kpc_config_t CounterRegs[KPC_MAX_COUNTERS]; |
|
int ret; |
|
|
|
static int Init; |
|
if (!Init) |
|
{ |
|
void* KPerf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/kperf", RTLD_LAZY); |
|
assert(KPerf); |
|
|
|
#define X(ret, name, ...) name = dlsym(KPerf, #name); assert(name); |
|
KPERF_FUNCS(X) |
|
#undef X |
|
|
|
void* KPerfData = dlopen("/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata", RTLD_LAZY); |
|
assert(KPerfData); |
|
|
|
#define X(ret, name, ...) name = dlsym(KPerfData, #name); assert(name); |
|
KPERFDATA_FUNCS(X) |
|
#undef X |
|
|
|
kpep_db* KpepDb; |
|
kpep_config* KpepConfig; |
|
|
|
ret = kpep_db_create(NULL, &KpepDb); assert(!ret && "kpep_db_create failed"); |
|
ret = kpep_config_create(KpepDb, &KpepConfig); assert(!ret && "kpep_config_create failed"); |
|
ret = kpep_config_force_counters(KpepConfig); assert(!ret && "kpep_config_force_counters failed"); |
|
|
|
#if 0 // dump all available events |
|
size_t Count; |
|
kpep_db_events_count(KpepDb, &Count); |
|
kpep_event** Events = (kpep_event**)calloc(sizeof(*Events), Count); |
|
kpep_db_events(KpepDb, Events, Count * sizeof(*Events)); |
|
for (size_t i=0; i<Count; i++) |
|
{ |
|
const char* Name; |
|
const char* Desc; |
|
kpep_event_name(Events[i], &Name); |
|
kpep_event_description(Events[i], &Desc); |
|
printf("%-35s %s\n", Name, Desc); |
|
} |
|
free(Events); |
|
#endif |
|
|
|
static const char* EventNames[][3] = |
|
{ |
|
{ "FIXED_CYCLES", "CPU_CLK_UNHALTED.THREAD", 0 }, // cycles |
|
{ "FIXED_INSTRUCTIONS", "INST_RETIRED.ANY", 0 }, // instructions |
|
{ "BRANCH_MISPRED_NONSPEC", "BRANCH_MISPREDICT", "BR_MISP_RETIRED.ALL_BRANCHES" }, // branch-misses |
|
{ "INST_BRANCH", "BR_INST_RETIRED.ALL_BRANCHES", 0 }, // branch-count |
|
}; |
|
|
|
for (size_t e=0; e<sizeof(EventNames)/sizeof(EventNames[0]); e++) |
|
{ |
|
for (size_t n=0; n<sizeof(EventNames[0])/sizeof(EventNames[0][0]); n++) |
|
{ |
|
kpep_event* Event; |
|
if (EventNames[e][n] && kpep_db_event(KpepDb, EventNames[e][n], &Event) == 0) |
|
{ |
|
const int UserSpaceOnly = 1;

ret = kpep_config_add_event(KpepConfig, &Event, UserSpaceOnly, NULL);

assert(!ret && "kpep_config_add_event failed");

CounterEventCount++;

break;
|
} |
|
} |
|
} |
|
|
|
ret = kpep_config_kpc_classes(KpepConfig, &CounterClasses); assert(!ret && "kpep_config_kpc_classes failed"); |
|
ret = kpep_config_kpc_count(KpepConfig, &CounterRegCount); assert(!ret && "kpep_config_kpc_count failed"); |
|
ret = kpep_config_kpc_map(KpepConfig, CounterMap, sizeof(CounterMap)); assert(!ret && "kpep_config_kpc_map failed"); |
|
ret = kpep_config_kpc(KpepConfig, CounterRegs, sizeof(CounterRegs)); assert(!ret && "kpep_config_kpc failed"); |
|
|
|
kpep_config_free(KpepConfig); |
|
kpep_db_free(KpepDb); |
|
|
|
Init = 1; |
|
} |
|
|
|
qos_class_t ThreadClass; |
|
int ThreadPriority; |
|
pthread_get_qos_class_np(pthread_self(), &ThreadClass, &ThreadPriority); |
|
|
|
const int UseHighPerfCores = 1; |
|
pthread_set_qos_class_self_np(UseHighPerfCores ? QOS_CLASS_USER_INTERACTIVE : QOS_CLASS_BACKGROUND, ThreadPriority); |
|
|
|
// take exclusive ownership of all PMU counters; fails (returns non-zero) when not running as root

int CountersEnabled = kpc_force_all_ctrs_set(1);
|
if (CountersEnabled == 0) |
|
{ |
|
if ((CounterClasses & KPC_CLASS_CONFIGURABLE_MASK) && CounterRegCount) |
|
{ |
|
ret = kpc_set_config(CounterClasses, CounterRegs); |
|
assert(!ret && "kpc_set_config failed"); |
|
} |
|
ret = kpc_set_counting(CounterClasses); |
|
assert(!ret && "kpc_set_counting failed"); |
|
|
|
ret = kpc_set_thread_counting(CounterClasses); |
|
assert(!ret && "kpc_set_thread_counting failed"); |
|
} |
|
|
|
struct rusage UsageStart; |
|
getrusage(RUSAGE_SELF, &UsageStart); |
|
|
|
uint64_t CountersStart[KPC_MAX_COUNTERS] = { 0 }; |
|
if (CountersEnabled == 0) |
|
{ |
|
ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, CountersStart); // tid 0 = current thread
|
assert(!ret && "kpc_get_thread_counters failed"); |
|
} |
|
|
|
struct timespec TimeStart; |
|
struct timespec TimeEnd; |
|
clock_gettime(CLOCK_MONOTONIC_RAW, &TimeStart); |
|
Fun(Arg); |
|
clock_gettime(CLOCK_MONOTONIC_RAW, &TimeEnd); |
|
|
|
uint64_t CountersEnd[KPC_MAX_COUNTERS] = { 0 }; |
|
if (CountersEnabled == 0) |
|
{ |
|
ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, CountersEnd); |
|
assert(!ret && "kpc_get_thread_counters failed"); |
|
} |
|
|
|
struct rusage UsageEnd; |
|
getrusage(RUSAGE_SELF, &UsageEnd); |
|
|
|
if (CountersEnabled == 0) |
|
{ |
|
kpc_set_thread_counting(0); |
|
kpc_set_counting(0); |
|
kpc_force_all_ctrs_set(0); |
|
} |
|
|
|
pthread_set_qos_class_self_np(ThreadClass, ThreadPriority); |
|
|
|
// only the events actually added to the kpep config have valid CounterMap entries

for (size_t i=0; i<CounterEventCount; i++)
|
{ |
|
size_t Index = CounterMap[i]; |
|
Result.Counters[i] = CountersEnd[Index] - CountersStart[Index]; |
|
} |
|
Result.ElapsedTime = (TimeEnd.tv_sec - TimeStart.tv_sec) + 1e-9 * (TimeEnd.tv_nsec - TimeStart.tv_nsec); |
|
Result.ContextSwitches = UsageEnd.ru_nvcsw + UsageEnd.ru_nivcsw - UsageStart.ru_nvcsw - UsageStart.ru_nivcsw; |
|
return Result; |
|
} |
|
|
|
#endif |
@mmozeiko Nice code!
It has minor issues when compiling on Windows, like warnings from size_t -> ULONG conversions, and it took me a while to understand that I need to set WINVER to at least 0x0602 in the project macro definitions (otherwise TraceSetInformation is not declared in the evntrace.h header). But in the end it works!
Don't you want to create a git project out of this? I was thinking of making some kind of executable launcher for other apps to track their PMU counters, like
perf stat --verbose -e instructions
but for Windows - I could not find anything like this for Windows. Also it is interesting to see how on Windows we need this trick of monitoring context switches on the whole CPU just to correctly save PMCs (very clever, by the way!). I guess on Linux the perf syscall does the same under the hood.