Last active
December 23, 2019 00:19
-
-
Save danielealbano/b631e6d80a10d248ebcc527d32090c34 to your computer and use it in GitHub Desktop.
Test - OOP vs DOD data processing - C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile with | |
// gcc -O3 -m64 -mavx2 -funroll-loops -march=native -o test-oop-vs-dod-simple-data-processing test-oop-vs-dod-simple-data-processing.c | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <string.h> | |
// The compiler will not strip out anything related to this variable because it will assume it can be used by other | |
// code that is outside the scope of the code using it | |
volatile uint32_t foo = 0; | |
// Data structures used to build and print out the stats, they use hard-defined char buffers, the text is all | |
// defined in the code and will never get past the boundaries | |
struct test_report | |
{ | |
uint32_t game_objects_count; | |
uint64_t cpu_cycles; | |
uint64_t allocated_memory; | |
}; | |
typedef struct test_report test_report_t; | |
typedef int (*test_fn_t)(uint64_t game_object_count, test_report_t* report); | |
struct test | |
{ | |
char name[100]; | |
test_fn_t fn; | |
test_report_t* reports; | |
}; | |
typedef struct test test_t; | |
// Support functions to get the cpu cycles | |
inline uint64_t cpu_cycles_start() | |
{ | |
uint64_t cycles; | |
asm volatile( | |
"lfence\n\t" | |
"rdtsc\n\t" | |
"shl $32, %%rdx\n\t" | |
"or %%rdx, %0\n\t" | |
"lfence" | |
: "=a"(cycles) | |
: | |
// "memory" avoids reordering. rdx = TSC >> 32. | |
// "cc" = flags modified by SHL. | |
: "rdx", "memory", "cc"); | |
return cycles; | |
} | |
inline uint64_t cpu_cycles_end() | |
{ | |
uint64_t cycles; | |
asm volatile( | |
"rdtscp\n\t" | |
"shl $32, %%rdx\n\t" | |
"or %%rdx, %0\n\t" | |
"lfence" | |
: "=a"(cycles) | |
: | |
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. | |
// "cc" = flags modified by SHL. | |
: "rcx", "rdx", "memory", "cc"); | |
return cycles; | |
} | |
// Object Oriented Pattern | |
struct oop_game_object | |
{ | |
uint8_t deleted; | |
uint8_t is_visible; | |
uint16_t object_type; | |
uint32_t mesh_id; | |
struct | |
{ | |
double x; | |
double y; | |
double z; | |
} position; | |
struct | |
{ | |
double x; | |
double y; | |
double z; | |
} orientation; | |
struct | |
{ | |
bool has_animation; | |
uint32_t animation_id; | |
uint32_t frame; | |
} animation; | |
}; | |
typedef struct oop_game_object oop_game_object_t; | |
int test_oop_loop(uint64_t game_object_count, test_report_t* report) | |
{ | |
uint64_t allocated_memory = 0; | |
allocated_memory += sizeof(oop_game_object_t) * game_object_count; | |
// Allocate memory | |
oop_game_object_t* oop_game_objects = (oop_game_object_t*)malloc(sizeof(oop_game_object_t) * game_object_count); | |
if (oop_game_objects == NULL) | |
{ | |
perror("Unable to allocate oop_game_objects"); | |
return -1; | |
} | |
// Init the memory for the test | |
memset(oop_game_objects, 0, sizeof(oop_game_object_t) * game_object_count); | |
for(uint64_t index = 0; index < game_object_count / 2; index++) | |
{ | |
oop_game_objects[index].is_visible = 1; | |
} | |
// Loop of the data - warmup | |
// This gives an HUGE boost to the OOP test, the data will be copied into the L2/L3 cache and because they are | |
// reused right after it's very likely that most of them will still be there, this will reduce the cache misses a lot. | |
for(uint64_t index = 0; index < game_object_count; index++) | |
{ | |
if (oop_game_objects[index].deleted == 1) | |
{ | |
continue; | |
} | |
if (oop_game_objects[index].is_visible == 1) | |
{ | |
// This assigment is used to avoid that the compiler strips out the entire if because not being in use as | |
// part of the optimization process, process that we want to keep in place for the performance comparison | |
foo = index; | |
} | |
} | |
// Get the initial cpu cycles count | |
uint64_t cpu_cycles = cpu_cycles_start(); | |
// Loop of the data - very simple algorithm | |
for(uint64_t index = 0; index < game_object_count; index++) | |
{ | |
if (oop_game_objects[index].deleted == 1) | |
{ | |
continue; | |
} | |
if (oop_game_objects[index].is_visible == 1) | |
{ | |
// This assignment is used to avoid that the compiler strips out the entire if because not being in use as | |
// part of the optimization process, process that we want to keep in place for the performance comparison | |
foo = index; | |
} | |
} | |
// Calculate the CPU cycles | |
cpu_cycles = cpu_cycles_end() - cpu_cycles; | |
free(oop_game_objects); | |
// Update the report | |
report->cpu_cycles = cpu_cycles; | |
report->game_objects_count = game_object_count; | |
report->allocated_memory = allocated_memory; | |
return 0; | |
} | |
// | |
// Data Oriented Design | |
// | |
typedef uint8_t dod_game_object_deleted_t; | |
typedef uint8_t dod_game_object_is_visible_t; | |
struct dod_game_object | |
{ | |
uint16_t object_type; | |
uint32_t mesh_id; | |
struct | |
{ | |
double x; | |
double y; | |
double z; | |
} position; | |
struct | |
{ | |
double x; | |
double y; | |
double z; | |
} orientation; | |
struct | |
{ | |
bool has_animation; | |
uint32_t animation_id; | |
uint32_t frame; | |
} animation; | |
}; | |
typedef struct dod_game_object dod_game_object_t; | |
int test_dod_loop(uint64_t game_object_count, test_report_t* report) | |
{ | |
uint64_t allocated_memory = 0; | |
allocated_memory += sizeof(dod_game_object_t) * game_object_count; | |
allocated_memory += sizeof(dod_game_object_deleted_t) * game_object_count; | |
allocated_memory += sizeof(dod_game_object_is_visible_t) * game_object_count; | |
// Allocate the memory | |
dod_game_object_t* dod_game_objects = | |
(dod_game_object_t*)malloc(sizeof(dod_game_object_t) * game_object_count); | |
if (dod_game_objects == NULL) | |
{ | |
perror("Unable to allocate dod_game_objects"); | |
return -1; | |
} | |
dod_game_object_deleted_t* dod_game_object_deleted = | |
(dod_game_object_deleted_t*)malloc(sizeof(dod_game_object_deleted_t) * game_object_count); | |
if (dod_game_object_deleted == NULL) | |
{ | |
perror("Unable to allocate dod_game_object_deleted"); | |
return -2; | |
} | |
dod_game_object_is_visible_t* dod_game_object_is_visible = | |
(dod_game_object_is_visible_t*)malloc(sizeof(dod_game_object_is_visible_t) * game_object_count); | |
if (dod_game_object_is_visible == NULL) | |
{ | |
perror("Unable to allocate dod_game_object_is_visible"); | |
return -3; | |
} | |
// Init the memory for the test | |
memset(dod_game_objects, 0, sizeof(dod_game_object_t) * game_object_count); | |
memset(dod_game_objects, 0, sizeof(dod_game_object_deleted_t) * game_object_count); | |
memset(dod_game_objects, 0, sizeof(dod_game_object_is_visible_t) * game_object_count); | |
for(uint64_t index = 0; index < game_object_count / 2; index++) | |
{ | |
dod_game_object_is_visible[index] = 1; | |
} | |
// Loop of the data - warmup | |
// This gives an HUGE boost to the OOP test, the data will be copied into the L2/L3 cache and because they are | |
// reused right after it's very likely that most of them will still be there, this will reduce the cache misses a lot. | |
for(uint64_t index = 0; index < game_object_count; index++) | |
{ | |
if (dod_game_object_deleted[index] == 1) | |
{ | |
continue; | |
} | |
if (dod_game_object_is_visible[index] == 1) | |
{ | |
// This assigment is used to avoid that the compiler strips out the entire if because not being in use as | |
// part of the optimization process, process that we want to keep in place for the performance comparison | |
foo = index; | |
} | |
} | |
// Get the initial cpu cycles count | |
uint64_t cpu_cycles = cpu_cycles_start(); | |
// Loop of the data - very simple algorithm | |
for(uint64_t index = 0; index < game_object_count; index++) | |
{ | |
if (dod_game_object_deleted[index] == 1) | |
{ | |
continue; | |
} | |
if (dod_game_object_is_visible[index] == 1) | |
{ | |
// This assignment is used to avoid that the compiler strips out the entire if because not being in use as | |
// part of the optimization process, process that we want to keep in place for the performance comparison | |
foo = index; | |
} | |
} | |
// Calculate the CPU cycles | |
cpu_cycles = cpu_cycles_end() - cpu_cycles; | |
free(dod_game_objects); | |
// Update the report | |
report->cpu_cycles = cpu_cycles; | |
report->game_objects_count = game_object_count; | |
report->allocated_memory = allocated_memory; | |
return 0; | |
} | |
int main(int argc, char** argv) | |
{ | |
test_t* test; | |
uint8_t runs = 10; | |
uint32_t game_objects_counts[] = { 1000, 10000, 100000, 1000000, 10000000 }; | |
uint32_t tests_count = sizeof(game_objects_counts) / sizeof(uint32_t); | |
test_t tests[] = { | |
{ | |
.name = "Object Oriented Pattern", | |
.fn = &test_oop_loop, | |
.reports = (test_report_t*)malloc(sizeof(test_report_t) * tests_count) | |
}, | |
{ | |
.name = "Data Oriented Design", | |
.fn = &test_dod_loop, | |
.reports = (test_report_t*)malloc(sizeof(test_report_t) * tests_count) | |
}, | |
{ | |
0,0,0 | |
} | |
}; | |
test = tests; | |
do | |
{ | |
printf("Testing %s\n", test->name); | |
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++) | |
{ | |
uint32_t game_objects_count = game_objects_counts[loop_index]; | |
printf(" - loop %d, game_objects_count = %d (%d runs)\n", loop_index, game_objects_count, runs); | |
uint64_t cpu_cycles = 0; | |
for(uint8_t run_index = 0; run_index < runs; run_index++) | |
{ | |
int res = (*test->fn)(game_objects_count, &test->reports[loop_index]); | |
if (res != 0) | |
{ | |
fprintf(stderr, "Error during the execution, unable to continue\n"); | |
return -1; | |
} | |
cpu_cycles += test->reports[loop_index].cpu_cycles; | |
} | |
test->reports[loop_index].cpu_cycles = cpu_cycles / runs; | |
} | |
printf("\n"); | |
} | |
while(*(++test)->name != 0); | |
// Print the report | |
uint32_t separator_length = 28 + (18 * tests_count + 1); | |
printf("REPORT - CPU CYCLES (LOOP TOTAL / LOOP ITERATION)\n"); | |
for(uint32_t i = 0; i < separator_length; i++) printf("-"); | |
printf("\n"); | |
// Header | |
printf("| %25s ", "ARRAY SIZE"); | |
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++) | |
{ | |
printf("| %15d ", game_objects_counts[loop_index]); | |
} | |
printf("|\n"); | |
for(uint32_t i = 0; i < separator_length; i++) printf("-"); | |
printf("\n"); | |
test = tests; | |
do | |
{ | |
printf("| %25s ", test->name); | |
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++) | |
{ | |
printf("| %9lu ", test->reports[loop_index].cpu_cycles); | |
printf("| %3lu ", test->reports[loop_index].cpu_cycles / test->reports[loop_index].game_objects_count); | |
} | |
printf("|\n"); | |
} | |
while(*(++test)->name != 0); | |
for(uint32_t i = 0; i < separator_length; i++) printf("-"); | |
printf("\n"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment