// Source: GitHub Gist by @BigRedEye (BigRedEye/582d25ad01c634347c0aa31c13369aa1),
// created January 9, 2022 12:32.
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdexcept>
#include <optional>
namespace perf {

// 64-bit signed integer used for all counter values.
using int64 = long long;

// Plain aggregate of counter readings; every field is zero-initialized.
struct PerfMetrics
{
    int64 CpuInstructions = 0;
    int64 CacheMisses = 0;
    int64 TimeNs = 0;
};

// Owns one perf_event_open(2) file descriptor that counts a single event for
// the calling thread.
//
// The descriptor is closed in the destructor, so the type is move-only: a copy
// would leave two objects closing the same descriptor.
class PerfEvent {
public:
    // Opens a counter for the event identified by `type` / `config` (the
    // PERF_TYPE_* and PERF_COUNT_* constants from <linux/perf_event.h>).
    // The counter is created disabled; call Start() to begin counting.
    //
    // Throws std::runtime_error if the kernel refuses to open the event.
    explicit PerfEvent(__u32 type, __u32 config) {
        struct perf_event_attr attr{};
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = config;
        attr.disabled = 1;
        // pid = 0 and cpu = -1: measure the calling thread on any CPU.
        // group_fd = -1 and flags = 0: standalone event, no special flags.
        const long fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd == -1) {
            throw std::runtime_error{"Failed to open perf event"};
        }
        fd_ = static_cast<int>(fd);
    }

    PerfEvent(const PerfEvent&) = delete;
    PerfEvent& operator=(const PerfEvent&) = delete;

    // Takes over the descriptor of `other`, leaving `other` empty.
    PerfEvent(PerfEvent&& other) noexcept : fd_(other.fd_) {
        other.fd_ = -1;
    }

    // Closes the currently owned descriptor (if any) and takes over the one
    // owned by `other`, leaving `other` empty.
    PerfEvent& operator=(PerfEvent&& other) noexcept {
        if (this != &other) {
            Close();
            fd_ = other.fd_;
            other.fd_ = -1;
        }
        return *this;
    }

    ~PerfEvent() {
        Close();
    }

    // Resets the counter to zero and enables it.
    // Throws std::runtime_error if either ioctl fails.
    void Start() {
        if (ioctl(fd_, PERF_EVENT_IOC_RESET, 0) == -1 ||
            ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0) == -1) {
            throw std::runtime_error{"Failed to start perf event"};
        }
    }

    // Disables the counter; its current value stays readable via Read().
    // Throws std::runtime_error if the ioctl fails.
    void Stop() {
        if (ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0) == -1) {
            throw std::runtime_error{"Failed to stop perf event"};
        }
    }

    // Reads the current counter value.
    // Returns std::nullopt when read(2) does not deliver a full 8-byte value.
    std::optional<int64> Read() {
        int64 res = 0;
        const ssize_t got = ::read(fd_, &res, sizeof(res));
        if (got != static_cast<ssize_t>(sizeof(res))) {
            return std::nullopt;
        }
        return res;
    }

public:
    // Counter of retired CPU instructions.
    static PerfEvent CpuInstructions() {
        return PerfEvent{PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS};
    }

    // Counter of hardware cache misses.
    static PerfEvent CacheMisses() {
        return PerfEvent{PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
    }

    // Software task clock counter.
    static PerfEvent TaskClock() {
        return PerfEvent{PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK};
    }

private:
    // Closes the descriptor if one is owned; safe to call repeatedly.
    void Close() noexcept {
        if (fd_ != -1) {
            ::close(fd_);
            fd_ = -1;
        }
    }

    int fd_ = -1;  // -1 means "no descriptor owned".
};

} // namespace perf
#include <fmt/format.h>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>
#include <unordered_map>
#include <vector>
// The best allocator ever: one 8 GiB arena requested from malloc at startup.
// `init` remembers where the arena begins; `buf` is the cursor pointing at the
// next unused byte. The malloc result is not checked here.
char* init = static_cast<char*>(malloc(8ull << 30));
char* buf = init;
// Replacement for the global single-object allocation function: a bump
// allocator that hands out memory starting at the `buf` cursor.
//
// Each request is rounded up to a multiple of 16 bytes, so the cursor always
// advances in 16-byte steps. A zero-byte request is treated as one byte so
// that every call returns a distinct pointer.
//
// Throws std::bad_alloc when the cursor is null (the arena was never
// obtained) instead of handing a null-based pointer to the caller.
//
// NOTE: the end of the arena is not known here, so running past it is not
// detected.
void* operator new(size_t count) {
    constexpr size_t kAlignment = 16;

    if (buf == nullptr) {
        throw std::bad_alloc{};
    }
    if (count == 0) {
        count = 1;
    }

    // Round up to the next multiple of kAlignment (a power of two).
    const size_t rounded = (count + (kAlignment - 1)) & ~(kAlignment - 1);

    char* const ptr = buf;
    buf += rounded;
    return ptr;
}
// Deallocation is deliberately a no-op: memory handed out by the replaced
// operator new is never released individually.
//
// Both the unsized and the sized forms are replaced. If only the sized form
// were provided, a `delete` that resolves to the unsized form would reach the
// default implementation, which must not be given pointers that did not come
// from the default operator new.
void operator delete(void* ptr) noexcept {
    (void)ptr;
}

void operator delete(void* ptr, size_t count) noexcept {
    (void)ptr;
    (void)count;
}
void reset_mem() {
buf = init;
}
// Number of elements written per benchmark run; also the divisor used when
// reporting per-iteration figures.
constexpr int ITERATIONS = 100'000'000;

// Clock used for the wall-time measurement around each run.
using Clock = std::chrono::steady_clock;

// Converts a Clock duration into seconds expressed as a double.
double Seconds(Clock::duration delta) {
    const std::chrono::duration<double> as_seconds = delta;
    return as_seconds.count();
}
// Benchmark kernel: stores -i into element i for the first ITERATIONS
// elements of `map`. Kept out of line (noinline) so it stays a separate
// function in the generated code.
//
// Throws std::out_of_range if `map` holds fewer than ITERATIONS elements;
// the check runs once, outside the measured loop.
__attribute__((noinline)) void fill_map(std::vector<int>& map) {
    if (map.size() < static_cast<std::size_t>(ITERATIONS)) {
        throw std::out_of_range{"fill_map: vector holds fewer than ITERATIONS elements"};
    }
    for (int i = 0; i < ITERATIONS; ++i) {
        map[i] = -i;
    }
}
template <template <class ...> class Map>
__attribute__((noinline)) void run_map_test() {
perf::PerfEvent event = perf::PerfEvent::CpuInstructions();
event.Start();
std::vector<int> map;
map.resize(ITERATIONS);
fill_map(map);
auto delta = *event.Read();
fmt::print("{} instructions per iteration\n", static_cast<double>(delta) / ITERATIONS);
}
int main() {
for (int i = 0; i < 10; ++i) {
auto start = Clock::now();
reset_mem();
run_map_test<std::unordered_map>();
auto end = Clock::now();
fmt::print("It took {} secs for {} iterations\n", Seconds(end - start), ITERATIONS);
}
}
/* Possible output:
4.84316289 instructions per iteration
It took 0.242586475 secs for 100000000 iterations
2.00129663 instructions per iteration
It took 0.068408634 secs for 100000000 iterations
2.00115756 instructions per iteration
It took 0.068283233 secs for 100000000 iterations
2.0010276 instructions per iteration
It took 0.068269616 secs for 100000000 iterations
2.00129688 instructions per iteration
It took 0.06803983 secs for 100000000 iterations
2.00105244 instructions per iteration
It took 0.065850256 secs for 100000000 iterations
2.00100018 instructions per iteration
It took 0.06551507 secs for 100000000 iterations
2.00124106 instructions per iteration
It took 0.065625378 secs for 100000000 iterations
2.00103205 instructions per iteration
It took 0.065819786 secs for 100000000 iterations
2.00099453 instructions per iteration
It took 0.06574377 secs for 100000000 iterations
*/
// [GitHub page footer] Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment