Skip to content

Instantly share code, notes, and snippets.

@ceberly
Last active November 13, 2023 17:31
Show Gist options
  • Save ceberly/d35e1ca2c385fdd0ba2d2cc6c62d2909 to your computer and use it in GitHub Desktop.
Save ceberly/d35e1ca2c385fdd0ba2d2cc6c62d2909 to your computer and use it in GitHub Desktop.
Row vs. Column access patterns
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NUM_RECORDS 1000000
#define NUM_QUERIES 1000 // number of queries to run in a row
#define LOCATION_ID 42 // for the by_location queries
struct User {
int id;
char name[255];
int birth_location_id;
int time_of_birth;
};
struct ColDB {
int *id;
char *name; // store the names contiguously to mimic row db
int *birth_location_id;
int *time_of_birth;
} col_db;
// row oriented DB is just an array of users.
struct User *row_db;
struct ColDB col_db;
static void setup(void) {
row_db = malloc(sizeof(struct User) * NUM_RECORDS);
assert(row_db != NULL);
col_db.id = malloc(sizeof(int) * NUM_RECORDS);
col_db.name = malloc(255 * NUM_RECORDS);
col_db.birth_location_id = malloc(sizeof(int) * NUM_RECORDS);
col_db.time_of_birth = malloc(sizeof(int) * NUM_RECORDS);
srand((unsigned int)time(NULL));
for (int i = 0; i < NUM_RECORDS; i++) {
int id = abs(rand());
int l = abs(rand()) % 100; // make some duplicate birth locations...
int t = abs(rand()) % 86400; // seconds in a day
row_db[i].id = id;
row_db[i].name[32] = '\0'; // just so we can print the names
row_db[i].birth_location_id = l;
row_db[i].time_of_birth = t;
col_db.id[i] = id;
col_db.name[i * 255 + 32] = '\0';
col_db.birth_location_id[i] = l;
col_db.time_of_birth[i] = t;
}
}
static size_t rusers_by_location(struct User *results) {
size_t count = 0;
for (int i = 0; i < NUM_RECORDS; i++) {
struct User u = row_db[i];
if (u.birth_location_id == LOCATION_ID) {
results[count++] = u;
}
}
return count;
}
static size_t cusers_by_location(struct User *results) {
size_t count = 0;
for (int i = 0; i < NUM_RECORDS; i++) {
if (col_db.birth_location_id[i] == LOCATION_ID) {
struct User u = {
.id = col_db.id[i],
.birth_location_id = col_db.birth_location_id[i],
.time_of_birth = col_db.time_of_birth[i]
};
memcpy(u.name, col_db.name + (i * 255), 255);
results[count++] = u;
}
}
return count;
}
static int cmax_time_of_birth(void) {
int result = 0;
for (int i = 0; i < NUM_RECORDS; i++) {
int t = col_db.time_of_birth[i];
if (t > result)
result = t;
}
return result;
}
static int rmax_time_of_birth(void) {
int result = 0;
for (int i = 0; i < NUM_RECORDS; i++) {
int t = row_db[i].time_of_birth;
if (t > result)
result = t;
}
return result;
}
int main(int argc, char **argv) {
setup();
int c, r = 0;
size_t result_count = 0;
struct User *results = malloc(sizeof(struct User) * NUM_RECORDS);
assert(results != NULL);
time_t start = time(NULL);
for (int i = 0; i < NUM_QUERIES; i++) {
c = cmax_time_of_birth();
}
printf("elapsed: %lds\t cmax_time_of_birth: %d\n", time(NULL) - start, c);
start = time(NULL);
for (int i = 0; i < NUM_QUERIES; i++) {
r = rmax_time_of_birth();
}
printf("elapsed: %lds\trmax_time_of_birth: %d\n", time(NULL) - start, r);
start = time(NULL);
for (int i = 0; i < NUM_QUERIES; i++) {
result_count = rusers_by_location(results);
}
printf("elasped: %lds records for rusers_by_location count: %ld\n",
time(NULL) - start, result_count);
start = time(NULL);
for (int i = 0; i < NUM_QUERIES; i++) {
result_count = cusers_by_location(results);
}
printf("elasped: %lds records for cusers_by_location count: %ld\n",
time(NULL) - start, result_count);
return EXIT_SUCCESS;
}
default:
gcc -O0 -g3 -Wall -Wextra -Wconversion -Wdouble-promotion \
-Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion \
main.c
format:
clang-format -i main.c
# Output on my Intel i5 NUC:
# The corresponding SQL queries are roughly this:
#
# max_time_of_birth:
# SELECT MAX(time_of_birth) FROM users;
#
# users_by_location:
# -- Originally this was just supposed to select users in location "42"
# -- but I had to add extra lookups to try to break the cache on the column-based data structure.
# -- So let's call it:
# "select users from location 42 who were born in the first half of the day and aren't the super user"
# SELECT COUNT(1) FROM users WHERE location = 42 AND time_of_birth < 40000 AND id != 1
./a.out
elapsed: 0s cmax_time_of_birth: 86399
elapsed: 5s rmax_time_of_birth: 86399
elasped: 23s records for rusers_by_location count: 9975
elasped: 2s records for cusers_by_location count: 9975
@eatonphil
Copy link

What is the point of storing anything in the results struct? When you do results[count++] = u?

@eatonphil
Copy link

I don't think your experiment is incorrect. I think that the row-wise storage only really helps when you must iterate over all data and/or where you must return a large amount of data.

Here is my version. https://gist.github.com/eatonphil/c2948f65d216a9ad3667801578154eee

@ceberly
Copy link
Author

ceberly commented Nov 12, 2023

Nice, thank you. I am working on an updated version as well that shows more clearly what I was getting at (which your's does as well), since this got a lot of traffic

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment