Created
December 28, 2023 07:31
-
-
Save jyc/768872038b4cd2056e93a08d9cdef4f1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// cc -O3 -o dotproduct dotproduct.c -march=native -ffast-math | |
// | |
// Pretty much just Prof. Daniel Lemire's code, hacked by me | |
// All errors are mine | |
// | |
// Prof. Lemire says in https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/README.md: | |
// Unless otherwise stated, I make no copyright claim | |
// on this code: you may consider it to be in the public | |
// domain. | |
// Don't bother forking this code: just steal it. | |
#include <arm_neon.h> | |
#include <assert.h> | |
#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
const char *unitname = " nanoseconds "; | |
#define RDTSC_START(cycles) \ | |
do { \ | |
cycles = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); \ | |
} while (0) | |
#define RDTSC_STOP(cycles) \ | |
do { \ | |
cycles = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); \ | |
} while (0) | |
static __attribute__((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) { | |
return dummy; | |
} | |
uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX; | |
#define RDTSC_SET_OVERHEAD(test, repeat) \ | |
do { \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = UINT64_MAX; \ | |
for (int i = 0; i < repeat; i++) { \ | |
__asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
test; \ | |
RDTSC_STOP(cycles_final); \ | |
cycles_diff = (cycles_final - cycles_start); \ | |
if (cycles_diff < min_diff) \ | |
min_diff = cycles_diff; \ | |
} \ | |
global_rdtsc_overhead = min_diff; \ | |
} while (0) | |
/* | |
* Prints the best number of operations per cycle where | |
* test is the function call, answer is the expected answer generated by | |
* test, repeat is the number of times we should repeat and size is the | |
* number of operations represented by test. | |
*/ | |
#define BEST_TIME(test, expected, pre, repeat, size, bytes, verbose) \ | |
do { \ | |
if (global_rdtsc_overhead == UINT64_MAX) { \ | |
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \ | |
} \ | |
if (verbose) \ | |
printf("%-60s\t: ", #test); \ | |
fflush(NULL); \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = (uint64_t)-1; \ | |
uint64_t sum_diff = 0; \ | |
for (int i = 0; i < repeat; i++) { \ | |
pre; \ | |
__asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
if (test != expected) { \ | |
printf("not expected (%d , %d )", (int)test, (int)expected); \ | |
break; \ | |
} \ | |
RDTSC_STOP(cycles_final); \ | |
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \ | |
if (cycles_diff < min_diff) \ | |
min_diff = cycles_diff; \ | |
sum_diff += cycles_diff; \ | |
} \ | |
uint64_t S = size; \ | |
float cycle_per_op = (min_diff) / (double)S; \ | |
float bytes_per_cycle = (double)bytes / (min_diff); \ | |
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \ | |
if (verbose) \ | |
printf(" %.3f %s per operation (best) ", cycle_per_op, unitname); \ | |
if (verbose) \ | |
printf(" %.3f bytes per cycle (best) ", bytes_per_cycle); \ | |
if (verbose) \ | |
printf("\t%.3f %s per operation (avg) ", avg_cycle_per_op, unitname); \ | |
if (verbose) \ | |
printf("\n"); \ | |
if (!verbose) \ | |
printf(" %.3f (%.3f) ", cycle_per_op,bytes_per_cycle); \ | |
fflush(NULL); \ | |
} while (0) | |
// like BEST_TIME, but no check | |
#define BEST_TIME_NOCHECK(test, pre, repeat, size, verbose) \ | |
do { \ | |
if (global_rdtsc_overhead == UINT64_MAX) { \ | |
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \ | |
} \ | |
if (verbose) \ | |
printf("%-20s\t: ", #test); \ | |
fflush(NULL); \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = (uint64_t)-1; \ | |
uint64_t sum_diff = 0; \ | |
for (int i = 0; i < repeat; i++) { \ | |
pre; \ | |
__asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
test; \ | |
RDTSC_STOP(cycles_final); \ | |
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \ | |
if (cycles_diff < min_diff) \ | |
min_diff = cycles_diff; \ | |
sum_diff += cycles_diff; \ | |
} \ | |
uint64_t S = size; \ | |
float cycle_per_op = (min_diff) / (double)S; \ | |
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \ | |
if (verbose) \ | |
printf(" %.3f %s per operation (best) ", cycle_per_op, unitname); \ | |
if (verbose) \ | |
printf("\n"); \ | |
if (!verbose) \ | |
printf(" %.3f ", cycle_per_op); \ | |
fflush(NULL); \ | |
} while (0) | |
// like BEST_TIME except that we run a function to check the result | |
#define BEST_TIME_CHECK(test, check, pre, repeat, size, verbose) \ | |
do { \ | |
if (global_rdtsc_overhead == UINT64_MAX) { \ | |
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \ | |
} \ | |
if (verbose) \ | |
printf("%-60s\t: ", #test); \ | |
fflush(NULL); \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = (uint64_t)-1; \ | |
uint64_t sum_diff = 0; \ | |
for (int i = 0; i < repeat; i++) { \ | |
pre; \ | |
__asm volatile("" ::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
test; \ | |
RDTSC_STOP(cycles_final); \ | |
if (!check) { \ | |
printf("error"); \ | |
break; \ | |
} \ | |
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \ | |
if (cycles_diff < min_diff) \ | |
min_diff = cycles_diff; \ | |
sum_diff += cycles_diff; \ | |
} \ | |
uint64_t S = size; \ | |
float cycle_per_op = (min_diff) / (double)S; \ | |
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \ | |
if (verbose) \ | |
printf(" %.3f cycles per operation (best) ", cycle_per_op); \ | |
if (verbose) \ | |
printf("\t%.3f cycles per operation (avg) ", avg_cycle_per_op); \ | |
if (verbose) \ | |
printf("\n"); \ | |
if (!verbose) \ | |
printf(" %.3f ", cycle_per_op); \ | |
fflush(NULL); \ | |
} while (0) | |
__attribute__((noinline)) float dot(float *x1, float *x2, size_t len) { | |
float sum = 0; | |
for (size_t i = 0; i < len; i++) { | |
sum += x1[i] * x2[i]; | |
} | |
return sum; | |
} | |
__attribute__((noinline)) float dot512fma2(float *x1, float *x2, size_t len) { | |
assert(len % 16 == 0); // Adjust this to match the vector size for NEON (4 floats) | |
float32x4_t sum = vdupq_n_f32(0); | |
for (size_t i = 0; i < len; i += 16) { | |
float32x4_t v11 = vld1q_f32(x1 + i); | |
float32x4_t v21 = vld1q_f32(x2 + i); | |
float32x4_t v12 = vld1q_f32(x1 + i + 4); | |
float32x4_t v22 = vld1q_f32(x2 + i + 4); | |
float32x4_t v13 = vld1q_f32(x1 + i + 8); | |
float32x4_t v23 = vld1q_f32(x2 + i + 8); | |
float32x4_t v14 = vld1q_f32(x1 + i + 12); | |
float32x4_t v24 = vld1q_f32(x2 + i + 12); | |
sum = vfmaq_f32(sum, v11, v21); | |
sum = vfmaq_f32(sum, v12, v22); | |
sum = vfmaq_f32(sum, v13, v23); | |
sum = vfmaq_f32(sum, v14, v24); | |
} | |
// Reduce the partial results | |
float32x2_t r = vadd_f32(vget_high_f32(sum), vget_low_f32(sum)); | |
r = vpadd_f32(r, r); | |
// Extract the final result | |
return vget_lane_f32(r, 0); | |
} | |
void demo(size_t number) { | |
number = number / 16 * 16; | |
printf("number = %zu.\n", number); | |
size_t bytes = number * 2 * sizeof(float); | |
if(bytes < 1024 * 1024) { | |
printf(" %.2f kB \n", (number * 2 * sizeof(float)) / (1024.0)); | |
} else { | |
printf(" %.2f MB \n", (number * 2 * sizeof(float)) / (1024.0* 1024.0)); | |
} | |
float *x1 = (float *)malloc(sizeof(float) * number); | |
float *x2 = (float *)malloc(sizeof(float) * number); | |
for (size_t i = 0; i < number; i++) { | |
x1[i] = i * 1.f / number; | |
x2[i] = (10 - i) * 1.f / number; | |
} | |
int repeat = 50; | |
float expected = dot(x1, x2, number); | |
printf("1: %f\n", expected); | |
expected = dot512fma2(x1, x2, number); | |
printf("2: %f\n", expected); | |
BEST_TIME(dot512fma2(x1, x2, number), expected, , repeat, number, bytes, true); | |
printf("\n"); | |
free(x1); | |
free(x2); | |
} | |
int main() { | |
demo(1024); | |
demo(2097152); | |
demo(4194304); | |
demo(8388608); | |
demo(16777216); | |
demo(33554432); | |
demo(134217728); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment