Last active
July 17, 2024 19:15
-
-
Save bitonic/78887f5d3238bab5e31f3c5a41d404b2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// See <https://gist.github.com/pervognsen/b58108d134824e61caedffcc01004e03> for
// Per Vognsen gist on value speculation. | |
// | |
// I compile this on linux with | |
// | |
// $ clang --version | |
// clang version 12.0.0 | |
// Target: x86_64-unknown-linux-gnu | |
// $ clang -static -Wall -O3 value-speculation-linux.c -o value-speculation-linux | |
// | |
// On an "Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz" I get:
// | |
// 16kB, 10000 iterations | |
// sum1: 8465052097154389858, 1.12us, 14.24GB/s, 3.91 cycles/elem, 1.02 instrs/cycle, 3.48GHz, 4.01 instrs/elem | |
// sum2: 8465052097154389858, 0.57us, 27.95GB/s, 1.99 cycles/elem, 5.02 instrs/cycle, 3.48GHz, 10.01 instrs/elem | |
// sum3: 8465052097154389858, 0.37us, 43.40GB/s, 1.28 cycles/elem, 3.91 instrs/cycle, 3.48GHz, 5.01 instrs/elem | |
// sum4: 8465052097154389858, 0.39us, 41.03GB/s, 1.36 cycles/elem, 3.69 instrs/cycle, 3.48GHz, 5.01 instrs/elem | |
// sum5: 8465052097154389858, 0.37us, 43.55GB/s, 1.28 cycles/elem, 3.92 instrs/cycle, 3.48GHz, 5.01 instrs/elem | |
// 128kB, 10000 iterations | |
// sum1: 6947699366156898439, 9.07us, 14.11GB/s, 3.95 cycles/elem, 1.01 instrs/cycle, 3.49GHz, 4.00 instrs/elem | |
// sum2: 6947699366156898439, 4.51us, 28.36GB/s, 1.97 cycles/elem, 5.09 instrs/cycle, 3.49GHz, 10.00 instrs/elem | |
// sum3: 6947699366156898439, 3.78us, 33.84GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// sum4: 6947699366156898439, 3.32us, 38.51GB/s, 1.45 cycles/elem, 3.45 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// sum5: 6947699366156898439, 3.78us, 33.86GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.49GHz, 5.00 instrs/elem | |
// 5000kB, 100 iterations | |
// sum1: 2134986631019855758, 0.36ms, 13.96GB/s, 3.99 cycles/elem, 1.00 instrs/cycle, 3.48GHz, 4.00 instrs/elem | |
// sum2: 2134986631019855758, 0.19ms, 26.22GB/s, 2.12 cycles/elem, 4.71 instrs/cycle, 3.48GHz, 10.00 instrs/elem | |
// sum3: 2134986631019855758, 0.17ms, 28.90GB/s, 1.93 cycles/elem, 2.59 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// sum4: 2134986631019855758, 0.17ms, 30.07GB/s, 1.85 cycles/elem, 2.70 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// sum5: 2134986631019855758, 0.17ms, 28.59GB/s, 1.95 cycles/elem, 2.57 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// 4294MB, 1 iterations | |
// sum1: 15446485409674718527, 0.45 s, 9.54GB/s, 5.84 cycles/elem, 0.68 instrs/cycle, 3.48GHz, 4.00 instrs/elem | |
// sum2: 15446485409674718527, 0.33 s, 13.15GB/s, 4.23 cycles/elem, 2.36 instrs/cycle, 3.48GHz, 10.00 instrs/elem | |
// sum3: 15446485409674718527, 0.31 s, 13.82GB/s, 4.02 cycles/elem, 1.24 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// sum4: 15446485409674718527, 0.30 s, 14.45GB/s, 3.85 cycles/elem, 1.30 instrs/cycle, 3.47GHz, 5.00 instrs/elem | |
// sum5: 15446485409674718527, 0.31 s, 14.00GB/s, 3.97 cycles/elem, 1.26 instrs/cycle, 3.48GHz, 5.00 instrs/elem | |
// | |
// When the data fits in the cache (L1, L2, and L3 specifically) the increase in | |
// performance is pronounced (and in fact higher than Per's experiments). With a | |
// bigger dataset (the size Per Vognsen uses) I believe I am hitting the limits of
// the RAM itself, given that: | |
// | |
// $ sysbench memory --memory-block-size=1G --memory-oper=read --threads=1 run | |
// ... | |
// 102400.00 MiB transferred (12081.43 MiB/sec) | |
// | |
// The datasets are designed to fit in L1, L2, and L3 respectively. | |
#define _GNU_SOURCE | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <sys/ioctl.h> | |
#include <linux/perf_event.h> | |
#include <asm/unistd.h> | |
#include <sched.h> | |
#include <stdbool.h> | |
#define NOINLINE __attribute__((noinline))
// The functions we're benchmarking
typedef struct Node {
    uint64_t value;
    struct Node *next;
} Node;

// Baseline: plain pointer chasing. Every iteration's load of node->next is
// on the critical path, so throughput is bounded by load-to-use latency.
NOINLINE
static uint64_t sum1(Node *node) {
    uint64_t total = 0;
    for (Node *cur = node; cur != NULL; cur = cur->next) {
        total += cur->value;
    }
    return total;
}
// This is the version from Per, without the loop unrolling -- it works (in | |
// clang anyway), but we can do a bit better (see `sum3`). | |
NOINLINE
static uint64_t sum2(Node *node) {
// Value speculation: guess that the next node is physically adjacent
// (node + 1). When the guess is right, `node` is updated from the
// prediction rather than from the freshly-loaded `next`, breaking the
// load-to-load dependency chain; the branch predictor hides the check.
uint64_t value = 0;
while (node) {
value += node->value;
Node *predicted_next = node + 1;
Node *next = node->next;
if (next == predicted_next) {
// Prevent compilers optimizing this apparently meaningless branch away
// by making them think we're changing predicted_next here.
//
// This trick, however, does not work with GCC, only with clang. GCC here
// derives that `next` and `predicted_next` are the same, and therefore
// merges them into the same variable, which re-introduces the data
// dependency we wanted to get rid of.
asm("" : "+r"(predicted_next));
node = predicted_next;
} else {
node = next;
}
}
return value;
}
// We would like to just write the following C code: | |
// | |
// uint64_t sum2(Node* node) { | |
// uint64_t value = 0; | |
// while (node) { | |
// value += node->value; | |
// node++; | |
// if (node != node->next) { | |
// node = node->next; | |
// } | |
// } | |
// return value; | |
// } | |
// | |
// But it's hard to get compilers to not compile the semantically pointless | |
// node++ and the if away. | |
// | |
// This uses 5 instructions per element, leveraging the fact that if next | |
// is the same as the predicted next, we know there's a next iteration. | |
// | |
// The original version used 6 instructions per element, but Rihalto pointed | |
// out on twitter that a jump could be eliminated: | |
// <https://twitter.com/RhialtoTheM/status/1418926515526459394> | |
NOINLINE | |
static uint64_t sum3(Node *node) { | |
uint64_t value = 0; | |
Node *next = NULL; | |
asm ( | |
"begin_loop_%=:\n" | |
" testq %[node], %[node]\n" | |
" je end_%=\n" | |
"loop_body_%=:\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" je loop_body_%=\n" | |
" movq %[next], %[node]\n" | |
" testq %[node], %[node]\n" | |
" jne loop_body_%=\n" | |
"end_%=:\n" | |
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next) | |
: | |
: "cc" | |
); | |
return value; | |
} | |
// Like `sum3`, but unroll the loop a bunch of times. | |
NOINLINE | |
static uint64_t sum4(Node *node) { | |
uint64_t value = 0; | |
Node *next = NULL; | |
asm ( | |
"begin_loop_%=:\n" | |
" testq %[node], %[node]\n" | |
" je end_%=\n" | |
"loop_body_%=:\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" jne assign_node_%=\n" | |
" addq (%[node]), %[value]\n" | |
" movq 8(%[node]), %[next]\n" | |
" addq $16, %[node]\n" | |
" cmpq %[node], %[next]\n" | |
" je loop_body_%=\n" | |
"assign_node_%=:\n" | |
" movq %[next], %[node]\n" | |
" testq %[node], %[node]\n" | |
" jne loop_body_%=\n" | |
"end_%=:\n" | |
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next) | |
: | |
: "cc" | |
); | |
return value; | |
} | |
// Version without needing assembly due to Alexander Monakov | |
// <https://twitter.com/_monoid/status/1418663360871141376>. Does | |
// roughly as well as sum3. | |
NOINLINE
static uint64_t sum5(Node *node) {
// NOTE: this exact control-flow shape is what coaxes compilers into the
// speculative codegen; do not "simplify" it.
uint64_t value = 0;
// The outer loop advances through the real `next` pointer; the inner loop
// speculatively walks physically adjacent nodes (node + 1) for as long as
// the layout prediction holds, mirroring sum3 without hand-written asm.
for (; node; node = node->next) {
for (;;) {
value += node->value;
if (node + 1 != node->next) {
break;
}
node++;
}
}
return value;
}
// Pin to first CPU | |
// Pin the process to CPU 0 so perf counters and frequencies are stable
// across the whole benchmark; exits on failure.
static void pin_to_cpu_0(void) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(0, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
        fprintf(stderr, "Could not set CPU affinity\n");
        exit(EXIT_FAILURE);
    }
}
// perf instrumentation -- a mixture of man 3 perf_event_open and | |
// <https://stackoverflow.com/a/42092180> | |
static long | |
perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { | |
int ret; | |
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); | |
return ret; | |
} | |
// Configure and open one perf counter, storing its fd and group-local id.
// Pass group_fd == -1 to create a group leader; otherwise the event joins
// an existing group so all counters are read atomically in one read().
// Exits the process on any failure.
static void setup_perf_event(
    struct perf_event_attr *evt,
    int *fd,
    uint64_t *id,
    uint32_t evt_type,
    uint64_t evt_config,
    int group_fd
) {
    memset(evt, 0, sizeof(struct perf_event_attr));
    evt->type = evt_type;
    evt->size = sizeof(struct perf_event_attr);
    evt->config = evt_config;
    evt->disabled = 1;         // start stopped; enabled explicitly around the measured loop
    evt->exclude_kernel = 1;   // count user-space work only
    evt->exclude_hv = 1;
    // Group read format: one read() yields every counter, tagged with ids.
    evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
    *fd = perf_event_open(evt, 0, -1, group_fd, 0);
    if (*fd == -1) {
        fprintf(stderr, "Error opening leader %llx\n", evt->config);
        exit(EXIT_FAILURE);
    }
    // Fix: this ioctl was previously unchecked — on failure `*id` would be
    // garbage and read_perf_count() would mis-attribute counter values.
    if (ioctl(*fd, PERF_EVENT_IOC_ID, id) == -1) {
        fprintf(stderr, "Error reading perf event id\n");
        exit(EXIT_FAILURE);
    }
}
// Perf counter state: one (attr, fd, id) triple per event. The cycles
// event is the group leader; the others join its group in perf_init() so
// a single read() on perf_cycles_fd returns every counter at once.
static struct perf_event_attr perf_cycles_evt;
static int perf_cycles_fd;
static uint64_t perf_cycles_id;
// Task clock (software event) — used to derive wall time per run.
static struct perf_event_attr perf_clock_evt;
static int perf_clock_fd;
static uint64_t perf_clock_id;
// Retired instructions.
static struct perf_event_attr perf_instrs_evt;
static int perf_instrs_fd;
static uint64_t perf_instrs_id;
// Cache misses / references (counted but not currently printed).
static struct perf_event_attr perf_cache_misses_evt;
static int perf_cache_misses_fd;
static uint64_t perf_cache_misses_id;
static struct perf_event_attr perf_cache_references_evt;
static int perf_cache_references_fd;
static uint64_t perf_cache_references_id;
// Open the five perf counters we track. The cycles event (group_fd == -1)
// becomes the group leader; every subsequent event joins its group via
// perf_cycles_fd, so all five are enabled/disabled/read together.
static void perf_init(void) {
// Cycles
setup_perf_event(
&perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, -1
);
// Clock
setup_perf_event(
&perf_clock_evt, &perf_clock_fd, &perf_clock_id,
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd
);
// Instructions
setup_perf_event(
&perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd
);
// Cache misses
setup_perf_event(
&perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd
);
// Cache references
setup_perf_event(
&perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd
);
}
static void perf_close(void) { | |
close(perf_clock_fd); | |
close(perf_cycles_fd); | |
close(perf_instrs_fd); | |
close(perf_cache_misses_fd); | |
close(perf_cache_references_fd); | |
} | |
// Stop all counters in the group (the ioctl on the leader fd with
// PERF_IOC_FLAG_GROUP applies to every member).
static void disable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
// Start all counters in the group.
static void enable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
// Zero all counters in the group.
static void reset_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
// One (counter value, event id) pair as laid out by a read() with
// PERF_FORMAT_GROUP | PERF_FORMAT_ID (see man 2 perf_event_open).
struct perf_read_value {
uint64_t value;
uint64_t id;
};
// Header of a group read: `nr` entries of perf_read_value follow.
struct perf_read_format {
uint64_t nr;
struct perf_read_value values[];
};
// Scratch buffer for the raw group read; 4096 bytes is ample for 5 events.
static char perf_read_buf[4096];
// Aggregated counters for one measurement run, in useful units.
struct perf_count {
uint64_t cycles;
double seconds;
uint64_t instructions;
uint64_t cache_misses;
uint64_t cache_references;
};
static void read_perf_count(struct perf_count *count) { | |
if (!read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf))) { | |
fprintf(stderr, "Could not read cycles from perf\n"); | |
exit(EXIT_FAILURE); | |
} | |
struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf; | |
if (rf->nr != 5) { | |
fprintf(stderr, "Bad number of perf events\n"); | |
exit(EXIT_FAILURE); | |
} | |
for (int i = 0; i < rf->nr; i++) { | |
struct perf_read_value *value = &rf->values[i]; | |
if (value->id == perf_cycles_id) { | |
count->cycles = value->value; | |
} else if (value->id == perf_clock_id) { | |
count->seconds = ((double) (value->value / 1000ull)) / 1000000.0; | |
} else if (value->id == perf_instrs_id) { | |
count->instructions = value->value; | |
} else if (value->id == perf_cache_misses_id) { | |
count->cache_misses = value->value; | |
} else if (value->id == perf_cache_references_id) { | |
count->cache_references = value->value; | |
} else { | |
fprintf(stderr, "Spurious value in perf read (%lld)\n", value->id); | |
exit(EXIT_FAILURE); | |
} | |
} | |
} | |
// random numbers, from Per's other gist | |
// xorshift64 PRNG (from Per Vognsen's gist): a fixed seed and static state
// give a deterministic sequence, so every benchmark run sums identical data.
static uint64_t random_uint64(void) {
    static uint64_t state = 0x2545F4914F6CDD1D;
    uint64_t v = state;
    v ^= v << 13;
    v ^= v >> 7;
    v ^= v << 17;
    state = v;
    return v;
}
// run our functions | |
NOINLINE | |
static void run_sum(int pre_iterations, int iterations, uint64_t n, Node *node, const char* name, uint64_t (*f)(Node *)) { | |
uint64_t value; | |
for (int i = 0; i < pre_iterations; i++) { | |
value = f(node); | |
} | |
struct perf_count counts; | |
counts.seconds = 0.0; | |
counts.cycles = 0; | |
counts.instructions = 0; | |
counts.cache_misses = 0; | |
counts.cache_references = 0; | |
// enabling / disabling it inside each iteration does not seem | |
// to yield precise result for short durations. maybe we can tune | |
// the sampling rate? | |
disable_perf_count(); | |
reset_perf_count(); | |
enable_perf_count(); | |
for (int i = 0; i < iterations; i++) { | |
value = f(node); | |
} | |
disable_perf_count(); | |
read_perf_count(&counts); | |
counts.seconds /= ((double) iterations); | |
counts.cycles /= ((uint64_t) iterations); | |
counts.instructions /= ((uint64_t) iterations); | |
counts.cache_misses /= ((uint64_t) iterations); | |
counts.cache_references /= ((uint64_t) iterations); | |
uint64_t bytes = n * sizeof(Node); | |
double gb_per_s = (((double) bytes) / 1000000000.0) / counts.seconds; | |
double time = counts.seconds; | |
const char *unit = " s"; | |
if (time < 0.1) { | |
time *= 1000.0; | |
unit = "ms"; | |
} | |
if (time < 0.1) { | |
time *= 1000.0; | |
unit = "us"; | |
} | |
printf( | |
" %s: %20lu, %5.2f%s, %6.2fGB/s, %5.2f cycles/elem, %5.2f instrs/cycle, %5.2fGHz, %5.2f instrs/elem\n", | |
name, value, time, unit, gb_per_s, ((double) counts.cycles) / ((double) n), ((double) counts.instructions) / ((double) counts.cycles), ((double) counts.cycles) / counts.seconds / 1000000000.0, ((double) counts.instructions) / ((double) n) | |
); | |
} | |
static void run_tests(int iterations, uint64_t n) { | |
Node *nodes = malloc(n * sizeof(Node)); | |
if (!nodes) { | |
fprintf(stderr, "Could not allocate nodes\n"); | |
exit(EXIT_FAILURE); | |
} | |
for (uint64_t i = 0; i < n - 1; i++) { | |
nodes[i].value = random_uint64(); | |
nodes[i].next = &nodes[i+1]; | |
} | |
nodes[n-1].value = random_uint64(); | |
nodes[n-1].next = NULL; | |
perf_init(); | |
uint64_t amount = n * sizeof(Node); | |
const char *unit = "B"; | |
if (amount > 10000ull) { | |
amount /= 1000ull; | |
unit = "kB"; | |
} | |
if (amount > 10000ull) { | |
amount /= 1000ull; | |
unit = "MB"; | |
} | |
if (amount > 10000ull) { | |
amount /= 1000ull; | |
unit = "GB"; | |
} | |
printf("%lu%s, %d iterations\n", amount, unit, iterations); | |
run_sum(iterations, iterations, n, &nodes[0], "sum1", &sum1); | |
run_sum(iterations, iterations, n, &nodes[0], "sum2", &sum2); | |
run_sum(iterations, iterations, n, &nodes[0], "sum3", &sum3); | |
run_sum(iterations, iterations, n, &nodes[0], "sum4", &sum4); | |
run_sum(iterations, iterations, n, &nodes[0], "sum5", &sum5); | |
perf_close(); | |
free(nodes); | |
} | |
int main() {
    pin_to_cpu_0();
    // Dataset sizes chosen to target L1, L2, L3, and RAM respectively.
    static const struct { int iterations; uint64_t n; } tests[] = {
        { 10000, 1000llu },    // 16kB  -- should fit in L1
        { 10000, 8000llu },    // 128kB -- should fit in L2
        {   100, 312500llu },  // 5MB   -- should fit in L3
        {     1, 1llu << 28 }, // 4GB   -- doesn't fit anywhere
    };
    for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
        run_tests(tests[i].iterations, tests[i].n);
    }
    return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment