Last active
November 13, 2023 17:31
-
-
Save ceberly/d35e1ca2c385fdd0ba2d2cc6c62d2909 to your computer and use it in GitHub Desktop.
Row vs. Column access patterns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
#define NUM_RECORDS 1000000 | |
#define NUM_QUERIES 1000 // number of queries to run in a row | |
#define LOCATION_ID 42 // for the by_location queries | |
struct User { | |
int id; | |
char name[255]; | |
int birth_location_id; | |
int time_of_birth; | |
}; | |
struct ColDB { | |
int *id; | |
char *name; // store the names contiguously to mimic row db | |
int *birth_location_id; | |
int *time_of_birth; | |
} col_db; | |
// row oriented DB is just an array of users. | |
struct User *row_db; | |
struct ColDB col_db; | |
static void setup(void) { | |
row_db = malloc(sizeof(struct User) * NUM_RECORDS); | |
assert(row_db != NULL); | |
col_db.id = malloc(sizeof(int) * NUM_RECORDS); | |
col_db.name = malloc(255 * NUM_RECORDS); | |
col_db.birth_location_id = malloc(sizeof(int) * NUM_RECORDS); | |
col_db.time_of_birth = malloc(sizeof(int) * NUM_RECORDS); | |
srand((unsigned int)time(NULL)); | |
for (int i = 0; i < NUM_RECORDS; i++) { | |
int id = abs(rand()); | |
int l = abs(rand()) % 100; // make some duplicate birth locations... | |
int t = abs(rand()) % 86400; // seconds in a day | |
row_db[i].id = id; | |
row_db[i].name[32] = '\0'; // just so we can print the names | |
row_db[i].birth_location_id = l; | |
row_db[i].time_of_birth = t; | |
col_db.id[i] = id; | |
col_db.name[i * 255 + 32] = '\0'; | |
col_db.birth_location_id[i] = l; | |
col_db.time_of_birth[i] = t; | |
} | |
} | |
static size_t rusers_by_location(struct User *results) { | |
size_t count = 0; | |
for (int i = 0; i < NUM_RECORDS; i++) { | |
struct User u = row_db[i]; | |
if (u.birth_location_id == LOCATION_ID) { | |
results[count++] = u; | |
} | |
} | |
return count; | |
} | |
static size_t cusers_by_location(struct User *results) { | |
size_t count = 0; | |
for (int i = 0; i < NUM_RECORDS; i++) { | |
if (col_db.birth_location_id[i] == LOCATION_ID) { | |
struct User u = { | |
.id = col_db.id[i], | |
.birth_location_id = col_db.birth_location_id[i], | |
.time_of_birth = col_db.time_of_birth[i] | |
}; | |
memcpy(u.name, col_db.name + (i * 255), 255); | |
results[count++] = u; | |
} | |
} | |
return count; | |
} | |
static int cmax_time_of_birth(void) { | |
int result = 0; | |
for (int i = 0; i < NUM_RECORDS; i++) { | |
int t = col_db.time_of_birth[i]; | |
if (t > result) | |
result = t; | |
} | |
return result; | |
} | |
static int rmax_time_of_birth(void) { | |
int result = 0; | |
for (int i = 0; i < NUM_RECORDS; i++) { | |
int t = row_db[i].time_of_birth; | |
if (t > result) | |
result = t; | |
} | |
return result; | |
} | |
int main(int argc, char **argv) { | |
setup(); | |
int c, r = 0; | |
size_t result_count = 0; | |
struct User *results = malloc(sizeof(struct User) * NUM_RECORDS); | |
assert(results != NULL); | |
time_t start = time(NULL); | |
for (int i = 0; i < NUM_QUERIES; i++) { | |
c = cmax_time_of_birth(); | |
} | |
printf("elapsed: %lds\t cmax_time_of_birth: %d\n", time(NULL) - start, c); | |
start = time(NULL); | |
for (int i = 0; i < NUM_QUERIES; i++) { | |
r = rmax_time_of_birth(); | |
} | |
printf("elapsed: %lds\trmax_time_of_birth: %d\n", time(NULL) - start, r); | |
start = time(NULL); | |
for (int i = 0; i < NUM_QUERIES; i++) { | |
result_count = rusers_by_location(results); | |
} | |
printf("elasped: %lds records for rusers_by_location count: %ld\n", | |
time(NULL) - start, result_count); | |
start = time(NULL); | |
for (int i = 0; i < NUM_QUERIES; i++) { | |
result_count = cusers_by_location(results); | |
} | |
printf("elasped: %lds records for cusers_by_location count: %ld\n", | |
time(NULL) - start, result_count); | |
return EXIT_SUCCESS; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
default: | |
gcc -O0 -g3 -Wall -Wextra -Wconversion -Wdouble-promotion \ | |
-Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion \ | |
main.c | |
format: | |
clang-format -i main.c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Output on my Intel i5 NUC: | |
# The corresponding SQL queries are roughly this: | |
# | |
# max_time_of_birth: | |
# SELECT MAX(time_of_birth) FROM users; | |
# | |
# users_by_location: | |
# -- Originally this was just supposed to select users in location "42" | |
# -- but I had to add extra lookups to try to break the cache on the column-based data structure. | |
# -- So let's call it: | |
# "select users from location 42 who were born in the first half of the day and aren't the super user" | |
# SELECT COUNT(1) FROM users WHERE location = 42 AND time_of_birth < 40000 AND id != 1 | |
./a.out | |
elapsed: 0s cmax_time_of_birth: 86399 | |
elapsed: 5s rmax_time_of_birth: 86399 | |
elasped: 23s records for rusers_by_location count: 9975 | |
elasped: 2s records for cusers_by_location count: 9975 |
I don't think your experiment is incorrect. I think that the row-wise storage only really helps when you must iterate over all data and/or where you must return a large amount of data.
Here is my version. https://gist.github.com/eatonphil/c2948f65d216a9ad3667801578154eee
Nice, thank you. I am working on an updated version as well that shows more clearly what I was getting at (which your's does as well), since this got a lot of traffic
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What is the point of storing anything in the
results
struct? When you doresults[count++] = u
?