Skip to content

Instantly share code, notes, and snippets.

@s417-lama
Created January 31, 2023 13:37
Show Gist options
  • Save s417-lama/0ad2466c3b5f7cd49eec1cac4f17819d to your computer and use it in GitHub Desktop.
Save s417-lama/0ad2466c3b5f7cd49eec1cac4f17819d to your computer and use it in GitHub Desktop.
C program to measure actual throughput of AVX-512 FMA instructions and frequency
/*
* Usage:
* $ gcc -march=native -O3 -fopenmp avx_fma_checker.c
* $ ./a.out <n_threads> <n_iterations> <n_repeats>
*/
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <asm/unistd.h>
#include <linux/perf_event.h>

#include <omp.h>
/* Two-level stringification: STR(x) macro-expands x before quoting it. */
#define STR_(x) #x
#define STR(x) STR_(x)
/* Two-level token pasting: CONCAT(a, b) macro-expands a and b before joining. */
#define CONCAT_(a, b) a##b
#define CONCAT(a, b) CONCAT_(a, b)

/*
 * Floating-point precision selection.
 *
 * The preprocessor cannot compare type names: inside #if, every identifier
 * that is not a macro evaluates to 0, so a test such as `double == float`
 * is `0 == 0` and is always true. The choice is therefore made with an
 * integer selector, and the type name and intrinsic suffixes are derived
 * from it:
 *
 *   PRECISION   - element type (float or double)
 *   TYPE_SUFFIX - suffix of the vector type name (empty for float, d for double)
 *   FN_SUFFIX   - last letter of the intrinsic name (s for float, d for double)
 *
 * PRECISION_KIND may also be supplied on the command line, e.g.
 * -DPRECISION_KIND=1 for float.
 */
#define PRECISION_KIND_FLOAT 1
#define PRECISION_KIND_DOUBLE 2
#ifndef PRECISION_KIND
/* #define PRECISION_KIND PRECISION_KIND_FLOAT */
#define PRECISION_KIND PRECISION_KIND_DOUBLE
#endif
#if PRECISION_KIND == PRECISION_KIND_FLOAT
#define PRECISION float
#define TYPE_SUFFIX
#define FN_SUFFIX s
#elif PRECISION_KIND == PRECISION_KIND_DOUBLE
#define PRECISION double
#define TYPE_SUFFIX d
#define FN_SUFFIX d
#else
#error "PRECISION_KIND must be PRECISION_KIND_FLOAT or PRECISION_KIND_DOUBLE"
#endif
/*
 * Instruction-set selection, decided at compile time from the macros the
 * compiler predefines for the target (e.g. via -march=native):
 *
 *   SIMD_REGS     - number of vector accumulators each thread keeps in its
 *                   A[] array and feeds through the FMA loop in main()
 *   SIMD_LEN      - vector width in bits (printed by main() as "bits")
 *   SIMD_REG_TYPE - vector type name, built by pasting TYPE_SUFFIX
 *   SIMD_SET      - broadcast ("set1") intrinsic, built by pasting FN_SUFFIX
 *   SIMD_FMA      - fused multiply-add intrinsic, built by pasting FN_SUFFIX
 *
 * AVX-512F is preferred when available; AVX2 is the fallback; anything else
 * is rejected with a compile error.
 */
#if defined(__AVX512F__)
#include <immintrin.h>
#define SIMD_REGS 16
#define SIMD_LEN 512
/* 512-bit vector type and intrinsics. */
#define SIMD_REG_TYPE CONCAT(__m512 , TYPE_SUFFIX)
#define SIMD_SET CONCAT(_mm512_set1_p , FN_SUFFIX)
#define SIMD_FMA CONCAT(_mm512_fmadd_p, FN_SUFFIX)
#elif defined(__AVX2__)
#include <immintrin.h>
#define SIMD_REGS 16
#define SIMD_LEN 256
/* 256-bit vector type and intrinsics. */
#define SIMD_REG_TYPE CONCAT(__m256 , TYPE_SUFFIX)
#define SIMD_SET CONCAT(_mm256_set1_p , FN_SUFFIX)
#define SIMD_FMA CONCAT(_mm256_fmadd_p, FN_SUFFIX)
#else
#error "__AVX512F__ or __AVX2__ must be defined"
#endif
/*
 * Read CLOCK_MONOTONIC and return it flattened into one nanosecond count
 * (seconds * 1e9 + nanoseconds). The status returned by clock_gettime()
 * is not inspected.
 */
uint64_t gettime(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t whole_secs = (uint64_t)now.tv_sec;
    const uint64_t frac_nsecs = (uint64_t)now.tv_nsec;
    return whole_secs * 1000000000 + frac_nsecs;
}
/*
 * Thin wrapper around the raw perf_event_open system call, invoked through
 * syscall() with __NR_perf_event_open.
 *
 * All arguments are forwarded unchanged. The syscall() result is narrowed
 * to int and returned as-is; perf_event_init() treats -1 as failure.
 */
static inline int perf_event_open(struct perf_event_attr* pe,
                                  pid_t pid,
                                  int cpu,
                                  int group_fd,
                                  unsigned long flags) {
    return (int)syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
}
/*
 * Open, reset and start one hardware performance counter.
 *
 * config is stored in perf_event_attr.config with type PERF_TYPE_HARDWARE
 * (main() passes PERF_COUNT_HW_CPU_CYCLES and PERF_COUNT_HW_REF_CPU_CYCLES).
 * The counter excludes kernel and hypervisor activity, is created disabled,
 * and has the inherit bit set.
 *
 * Returns the counter's file descriptor. If the counter cannot be opened,
 * the error is reported with perror() and the process exits with
 * EXIT_FAILURE. Results of the two ioctl() calls are not checked.
 */
static inline int perf_event_init(uint64_t config) {
    struct perf_event_attr attr;
    /* Zero everything first so fields not set below are left at 0. */
    memset(&attr, 0, sizeof attr);
    attr.type           = PERF_TYPE_HARDWARE;
    attr.size           = sizeof attr;
    attr.config         = config;
    attr.disabled       = 1;
    attr.inherit        = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv     = 1;

    /* pid = 0, cpu = -1, no group leader (-1), no flags. */
    const int fd = perf_event_open(&attr, 0, -1, -1, 0);
    if (fd != -1) {
        /* Counter was created disabled: clear it, then start counting. */
        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        return fd;
    }

    perror("perf_event_open");
    exit(EXIT_FAILURE);
}
/*
 * Stop the counter behind fd and release the descriptor.
 * Neither the ioctl() nor the close() result is examined.
 */
static inline void perf_event_fini(int fd) {
    (void)ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    (void)close(fd);
}
/*
 * Read the current 64-bit value from fd.
 *
 * Exactly sizeof(uint64_t) bytes must be returned by read(); a short read
 * or an error prints a message to stderr and exits with EXIT_FAILURE.
 */
static inline uint64_t perf_event_read(int fd) {
    uint64_t value;
    const ssize_t got = read(fd, &value, sizeof value);

    if (got == (ssize_t)sizeof value) {
        return value;
    }

    fprintf(stderr, "Failed to read perf events\n");
    exit(EXIT_FAILURE);
}
int main(int argc, char** argv) {
int n_cpus = 1;
if (argc > 1) {
n_cpus = atoi(argv[1]);
}
uint64_t n_iters = 1000000000;
if (argc > 2) {
n_iters = atol(argv[2]);
}
uint64_t n_repeats = 3;
if (argc > 3) {
n_repeats = atol(argv[3]);
}
int cpu_cycle_fd = perf_event_init(PERF_COUNT_HW_CPU_CYCLES);
int ref_cycle_fd = perf_event_init(PERF_COUNT_HW_REF_CPU_CYCLES);
omp_set_num_threads(n_cpus);
int n_real_cpus;
#pragma omp parallel
{
#pragma omp single
n_real_cpus = omp_get_num_threads();
}
printf("[AVX FMA Performance Checker]\n");
printf("SIMD Length : %d bits\n", SIMD_LEN);
printf("Presicion : " STR(PRECISION) " (%ld bytes)\n", sizeof(PRECISION));
printf("# of Threads (CPUs) : %d\n" , n_real_cpus);
printf("# of Iterations : %ld\n" , n_iters);
printf("# of Repeats : %ld\n" , n_repeats);
printf("\n");
#pragma omp parallel
{
uint64_t t0, c0, r0, t1, c1, r1;
SIMD_REG_TYPE A[SIMD_REGS];
for (int i = 0; i < SIMD_REGS; i++) {
A[i] = SIMD_SET(1.0f);
}
for (int r = 0; r < n_repeats; r++) {
#pragma omp barrier
#pragma omp master
{
printf("========== Repeat: %d ==========\n", r);
t0 = gettime();
c0 = perf_event_read(cpu_cycle_fd);
r0 = perf_event_read(ref_cycle_fd);
}
#pragma omp barrier
for (size_t it = 0; it < n_iters; it++) {
for (int i = 0; i < SIMD_REGS; i++) {
A[i] = SIMD_FMA(A[i], A[i], A[i]);
}
}
#pragma omp barrier
#pragma omp master
{
t1 = gettime();
c1 = perf_event_read(cpu_cycle_fd);
r1 = perf_event_read(ref_cycle_fd);
}
for (int i = 0; i < SIMD_REGS; i++) {
if (*(double*)(A + i) == 0) {
printf("error\n");
}
}
#pragma omp master
{
uint64_t time_ns = t1 - t0;
uint64_t cpu_cycles = (c1 - c0) / n_real_cpus;
uint64_t ref_cycles = (r1 - r0) / n_real_cpus;
double cpu_ghz = (double)cpu_cycles / time_ns;
double ref_ghz = (double)ref_cycles / time_ns;
uint64_t total_fmas = n_iters * SIMD_REGS * SIMD_LEN / 8 / sizeof(PRECISION) * n_real_cpus;
double fmas_pc = (double)total_fmas / cpu_cycles;
double gflops = (double)2.0 * total_fmas / time_ns;
printf("Execution Time : %ld ns\n" , time_ns);
printf("CPU Cycles : %ld\n" , cpu_cycles);
printf("Ref Cycles : %ld\n" , ref_cycles);
printf("CPU Frequency : %f GHz\n" , cpu_ghz);
printf("Ref Frequency : %f GHz\n" , ref_ghz);
printf("Total FMAs : %ld\n" , total_fmas);
printf("FMAs/CPU cycle : %f\n" , fmas_pc);
printf("Performance : %f GFLOPS\n" , gflops);
printf("\n");
}
#pragma omp barrier
}
}
perf_event_fini(cpu_cycle_fd);
perf_event_fini(ref_cycle_fd);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment