Created
January 31, 2023 13:37
-
-
Save s417-lama/0ad2466c3b5f7cd49eec1cac4f17819d to your computer and use it in GitHub Desktop.
C program to measure actual throughput of AVX-512 FMA instructions and frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Usage:
 * $ gcc -march=native -O3 -fopenmp avx_fma_checker.c
 * $ ./a.out <n_threads> <n_iterations> <n_repeats>
 */
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/types.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <omp.h>
/*
 * Two-level stringification and token pasting: the outer macro lets its
 * arguments be macro-expanded before the inner one applies # or ##.
 */
#define STR_(x) #x
#define STR(x) STR_(x)
#define CONCAT_(a, b) a##b
#define CONCAT(a, b) CONCAT_(a, b)

/*
 * Floating-point precision selection.
 *
 * The choice is made on an integer selector, not on the type name: inside
 * #if, `float` and `double` are ordinary identifiers that evaluate to 0, so a
 * test such as `#if PRECISION == float` is always true and would pick the
 * single-precision intrinsics even when PRECISION is double.
 *
 * Define PRECISION_SELECT (here or with -DPRECISION_SELECT=1) to override the
 * default of double precision.
 *
 * Resulting macros:
 *   PRECISION    scalar element type (used for sizeof and for printing)
 *   TYPE_SUFFIX  suffix pasted onto the vector type name (empty or d)
 *   FN_SUFFIX    suffix pasted onto the intrinsic name (s or d)
 */
#define PRECISION_IS_FLOAT 1
#define PRECISION_IS_DOUBLE 2
/* #define PRECISION_SELECT PRECISION_IS_FLOAT */
#ifndef PRECISION_SELECT
#define PRECISION_SELECT PRECISION_IS_DOUBLE
#endif

#if PRECISION_SELECT == PRECISION_IS_FLOAT
#define PRECISION float
#define TYPE_SUFFIX
#define FN_SUFFIX s
#elif PRECISION_SELECT == PRECISION_IS_DOUBLE
#define PRECISION double
#define TYPE_SUFFIX d
#define FN_SUFFIX d
#else
#error "PRECISION must be float or double"
#endif
#if defined(__AVX512F__) | |
#include <immintrin.h> | |
#define SIMD_REGS 16 | |
#define SIMD_LEN 512 | |
#define SIMD_REG_TYPE CONCAT(__m512 , TYPE_SUFFIX) | |
#define SIMD_SET CONCAT(_mm512_set1_p , FN_SUFFIX) | |
#define SIMD_FMA CONCAT(_mm512_fmadd_p, FN_SUFFIX) | |
#elif defined(__AVX2__) | |
#include <immintrin.h> | |
#define SIMD_REGS 16 | |
#define SIMD_LEN 256 | |
#define SIMD_REG_TYPE CONCAT(__m256 , TYPE_SUFFIX) | |
#define SIMD_SET CONCAT(_mm256_set1_p , FN_SUFFIX) | |
#define SIMD_FMA CONCAT(_mm256_fmadd_p, FN_SUFFIX) | |
#else | |
#error "__AVX512F__ or __AVX2__ must be defined" | |
#endif | |
/*
 * Return the current CLOCK_MONOTONIC reading in nanoseconds.
 *
 * The value is only meaningful as a difference between two calls.
 */
uint64_t gettime(void) {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  const uint64_t seconds = (uint64_t)now.tv_sec;
  const uint64_t nanos = (uint64_t)now.tv_nsec;
  return seconds * 1000000000 + nanos;
}
/*
 * Thin wrapper around the perf_event_open system call, invoked through
 * syscall() with the arguments forwarded unchanged.
 *
 * Returns the syscall result narrowed to int: per perf_event_open(2), a new
 * file descriptor on success or -1 (with errno set) on failure.
 */
static inline int perf_event_open(struct perf_event_attr* pe,
                                  pid_t pid,
                                  int cpu,
                                  int group_fd,
                                  unsigned long flags) {
  return (int)syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
}
/*
 * Open, reset and enable one hardware performance counter.
 *
 * config: a PERF_COUNT_HW_* event id (used with PERF_TYPE_HARDWARE).
 *
 * The counter excludes kernel and hypervisor activity and is opened with
 * `inherit` set, so (per perf_event_open(2)) tasks created afterwards by this
 * process are counted as well.
 *
 * Returns the counter's file descriptor. Any failure (open, reset or enable)
 * prints a diagnostic and terminates the process, so callers never see an
 * invalid or silently disabled counter.
 */
static inline int perf_event_init(uint64_t config) {
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(struct perf_event_attr));
  pe.size = sizeof(struct perf_event_attr);
  pe.type = PERF_TYPE_HARDWARE;
  pe.config = config;
  pe.exclude_kernel = 1;
  pe.exclude_hv = 1;
  pe.disabled = 1; /* start stopped; enabled explicitly below */
  pe.inherit = 1;

  /* pid = 0, cpu = -1: measure the calling task on whichever CPU it runs */
  int fd = perf_event_open(&pe, 0, -1, -1, 0);
  if (fd == -1) {
    perror("perf_event_open");
    exit(EXIT_FAILURE);
  }

  if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) {
    perror("ioctl(PERF_EVENT_IOC_RESET)");
    close(fd);
    exit(EXIT_FAILURE);
  }
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    close(fd);
    exit(EXIT_FAILURE);
  }
  return fd;
}
/*
 * Stop the counter behind `fd` and release its file descriptor.
 * Failures of either step are ignored (best-effort teardown).
 */
static inline void perf_event_fini(int fd) {
  (void)ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  (void)close(fd);
}
/*
 * Read one 64-bit value from `fd` and return it.
 *
 * If the read does not deliver exactly sizeof(uint64_t) bytes (error or short
 * read), an error message is written to stderr and the process exits.
 */
static inline uint64_t perf_event_read(int fd) {
  uint64_t value;
  const ssize_t n_read = read(fd, &value, sizeof value);

  if (n_read == (ssize_t)sizeof value) {
    return value;
  }

  fprintf(stderr, "Failed to read perf events\n");
  exit(EXIT_FAILURE);
}
int main(int argc, char** argv) { | |
int n_cpus = 1; | |
if (argc > 1) { | |
n_cpus = atoi(argv[1]); | |
} | |
uint64_t n_iters = 1000000000; | |
if (argc > 2) { | |
n_iters = atol(argv[2]); | |
} | |
uint64_t n_repeats = 3; | |
if (argc > 3) { | |
n_repeats = atol(argv[3]); | |
} | |
int cpu_cycle_fd = perf_event_init(PERF_COUNT_HW_CPU_CYCLES); | |
int ref_cycle_fd = perf_event_init(PERF_COUNT_HW_REF_CPU_CYCLES); | |
omp_set_num_threads(n_cpus); | |
int n_real_cpus; | |
#pragma omp parallel | |
{ | |
#pragma omp single | |
n_real_cpus = omp_get_num_threads(); | |
} | |
printf("[AVX FMA Performance Checker]\n"); | |
printf("SIMD Length : %d bits\n", SIMD_LEN); | |
printf("Presicion : " STR(PRECISION) " (%ld bytes)\n", sizeof(PRECISION)); | |
printf("# of Threads (CPUs) : %d\n" , n_real_cpus); | |
printf("# of Iterations : %ld\n" , n_iters); | |
printf("# of Repeats : %ld\n" , n_repeats); | |
printf("\n"); | |
#pragma omp parallel | |
{ | |
uint64_t t0, c0, r0, t1, c1, r1; | |
SIMD_REG_TYPE A[SIMD_REGS]; | |
for (int i = 0; i < SIMD_REGS; i++) { | |
A[i] = SIMD_SET(1.0f); | |
} | |
for (int r = 0; r < n_repeats; r++) { | |
#pragma omp barrier | |
#pragma omp master | |
{ | |
printf("========== Repeat: %d ==========\n", r); | |
t0 = gettime(); | |
c0 = perf_event_read(cpu_cycle_fd); | |
r0 = perf_event_read(ref_cycle_fd); | |
} | |
#pragma omp barrier | |
for (size_t it = 0; it < n_iters; it++) { | |
for (int i = 0; i < SIMD_REGS; i++) { | |
A[i] = SIMD_FMA(A[i], A[i], A[i]); | |
} | |
} | |
#pragma omp barrier | |
#pragma omp master | |
{ | |
t1 = gettime(); | |
c1 = perf_event_read(cpu_cycle_fd); | |
r1 = perf_event_read(ref_cycle_fd); | |
} | |
for (int i = 0; i < SIMD_REGS; i++) { | |
if (*(double*)(A + i) == 0) { | |
printf("error\n"); | |
} | |
} | |
#pragma omp master | |
{ | |
uint64_t time_ns = t1 - t0; | |
uint64_t cpu_cycles = (c1 - c0) / n_real_cpus; | |
uint64_t ref_cycles = (r1 - r0) / n_real_cpus; | |
double cpu_ghz = (double)cpu_cycles / time_ns; | |
double ref_ghz = (double)ref_cycles / time_ns; | |
uint64_t total_fmas = n_iters * SIMD_REGS * SIMD_LEN / 8 / sizeof(PRECISION) * n_real_cpus; | |
double fmas_pc = (double)total_fmas / cpu_cycles; | |
double gflops = (double)2.0 * total_fmas / time_ns; | |
printf("Execution Time : %ld ns\n" , time_ns); | |
printf("CPU Cycles : %ld\n" , cpu_cycles); | |
printf("Ref Cycles : %ld\n" , ref_cycles); | |
printf("CPU Frequency : %f GHz\n" , cpu_ghz); | |
printf("Ref Frequency : %f GHz\n" , ref_ghz); | |
printf("Total FMAs : %ld\n" , total_fmas); | |
printf("FMAs/CPU cycle : %f\n" , fmas_pc); | |
printf("Performance : %f GFLOPS\n" , gflops); | |
printf("\n"); | |
} | |
#pragma omp barrier | |
} | |
} | |
perf_event_fini(cpu_cycle_fd); | |
perf_event_fini(ref_cycle_fd); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment