Last active
July 17, 2024 14:24
-
-
Save pervognsen/b58108d134824e61caedffcc01004e03 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Estimating CPU frequency... | |
// CPU frequency: 4.52 GHz | |
// sum1: value = 15182118497126522709, 0.31 secs, 5.14 cycles/elem | |
// sum2: value = 15182118497126522709, 0.17 secs, 2.93 cycles/elem | |
#define RW(x) asm("" : "+r"(x)) | |
typedef struct Node { | |
u64 value; | |
struct Node *next; | |
} Node; | |
// Naive linked list accumulation. Speed limited by L1 latency of 4-5 cycles/elem on Zen 3. | |
NOINLINE | |
u64 sum1(Node *node) { | |
u64 value = 0; | |
for (;;) { | |
for (int i = 0; i < 4; i++) { | |
if (!node) return value; | |
value += node->value; | |
node = node->next; | |
} | |
} | |
} | |
// Value speculation (via control speculation) on the next pointers to break data dependencies. | |
NOINLINE | |
u64 sum2(Node *node) { | |
u64 value = 0; | |
for (;;) { | |
for (int i = 0; i < 4; i++) { | |
if (!node) return value; | |
value += node->value; | |
Node *predicted_next = node + 1; | |
Node *next = node->next; | |
if (next == predicted_next) { | |
RW(predicted_next); | |
node = predicted_next; | |
} else { | |
node = next; | |
} | |
} | |
} | |
} | |
int main() { | |
printf("Estimating CPU frequency...\n"); | |
calc_cpufreq(); | |
printf("CPU frequency: %.2f GHz\n", 1e-9 * cpufreq); | |
u64 n = 1ull << 28; | |
Node *nodes = malloc(n * sizeof(Node)); | |
for (u64 i = 0; i < n - 1; i++) { | |
nodes[i].value = random_u64(); | |
nodes[i].next = &nodes[i+1]; | |
} | |
nodes[n-1].value = random_u64(); | |
nodes[n-1].next = NULL; | |
start_timer(); | |
u64 value1 = sum1(&nodes[0]); | |
double secs1 = stop_timer(); | |
printf("sum1: value = %llu, %.2f secs, %.2f cycles/elem\n", value1, secs1, secs1 * cpufreq / n); | |
start_timer(); | |
u64 value2 = sum2(&nodes[0]); | |
double secs2 = stop_timer(); | |
printf("sum2: value = %llu, %.2f secs, %.2f cycles/elem\n", value2, secs2, secs2 * cpufreq / n); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment