ceberly · November 13, 2023 17:31 · eatonphil · Nov 12, 2023 · eatonphil · Nov 12, 2023
diff --git a/main.c b/main.c
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>

 #define NUM_RECORDS 1000000
 #define NUM_QUERIES 1000 // number of queries to run in a row

 #define LOCATION_ID 42 // for the by_location queries

 struct User {
  int id;
  char name[255];
  int birth_location_id;
  int time_of_birth;
 };

 struct ColDB {
  int *id;
  char *name; // store the names contiguously to mimic row db
  int *birth_location_id;
  int *time_of_birth;
 } col_db;

 // row oriented DB is just an array of users.
 struct User *row_db;
 struct ColDB col_db;

 static void setup(void) {
  row_db = malloc(sizeof(struct User) * NUM_RECORDS);
  assert(row_db != NULL);

  col_db.id = malloc(sizeof(int) * NUM_RECORDS);
  col_db.name = malloc(255 * NUM_RECORDS);
  col_db.birth_location_id = malloc(sizeof(int) * NUM_RECORDS);
  col_db.time_of_birth = malloc(sizeof(int) * NUM_RECORDS);

  srand((unsigned int)time(NULL));

  for (int i = 0; i < NUM_RECORDS; i++) {
    int id = abs(rand());
    int l = abs(rand()) % 100;   // make some duplicate birth locations...
    int t = abs(rand()) % 86400; // seconds in a day

    row_db[i].id = id;
    row_db[i].name[32] = '\0'; // just so we can print the names
    row_db[i].birth_location_id = l;
    row_db[i].time_of_birth = t;

    col_db.id[i] = id;
    col_db.name[i * 255 + 32] = '\0';
    col_db.birth_location_id[i] = l;
    col_db.time_of_birth[i] = t;
  }
 }

 static size_t rusers_by_location(struct User *results) {
  size_t count = 0;

  for (int i = 0; i < NUM_RECORDS; i++) {
    struct User u = row_db[i];
    if (u.birth_location_id == LOCATION_ID) {
      results[count++] = u;
    }
  }

  return count;
 }

 static size_t cusers_by_location(struct User *results) {
  size_t count = 0;

  for (int i = 0; i < NUM_RECORDS; i++) {
    if (col_db.birth_location_id[i] == LOCATION_ID) {
      struct User u = {
        .id = col_db.id[i],
        .birth_location_id = col_db.birth_location_id[i],
        .time_of_birth = col_db.time_of_birth[i]
      };

      memcpy(u.name, col_db.name + (i * 255), 255);

      results[count++] = u;
    }
  }

  return count;
 }

 static int cmax_time_of_birth(void) {
  int result = 0;

  for (int i = 0; i < NUM_RECORDS; i++) {
    int t = col_db.time_of_birth[i];
    if (t > result)
      result = t;
  }

  return result;
 }

 static int rmax_time_of_birth(void) {
  int result = 0;

  for (int i = 0; i < NUM_RECORDS; i++) {
    int t = row_db[i].time_of_birth;
    if (t > result)
      result = t;
  }

  return result;
 }

 int main(int argc, char **argv) {
  setup();

  int c, r = 0;
  size_t result_count = 0;

  struct User *results = malloc(sizeof(struct User) * NUM_RECORDS);
  assert(results != NULL);

  time_t start = time(NULL);
  for (int i = 0; i < NUM_QUERIES; i++) {
    c = cmax_time_of_birth();
  }
  printf("elapsed: %lds\t cmax_time_of_birth: %d\n", time(NULL) - start, c);

  start = time(NULL);
  for (int i = 0; i < NUM_QUERIES; i++) {
    r = rmax_time_of_birth();
  }
  printf("elapsed: %lds\trmax_time_of_birth: %d\n", time(NULL) - start, r);

  start = time(NULL);
  for (int i = 0; i < NUM_QUERIES; i++) {
    result_count = rusers_by_location(results);
  }
  printf("elasped: %lds records for rusers_by_location count: %ld\n",
      time(NULL) - start, result_count);

  start = time(NULL);
  for (int i = 0; i < NUM_QUERIES; i++) {
    result_count = cusers_by_location(results);
  }
  printf("elasped: %lds records for cusers_by_location count: %ld\n",
      time(NULL) - start, result_count);

  return EXIT_SUCCESS;
 }
diff --git a/Makefile b/Makefile
 default:
 	gcc -O0 -g3 -Wall -Wextra -Wconversion -Wdouble-promotion \
 		-Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion \
 		main.c

 format:
 	clang-format -i main.c
diff --git a/output.txt b/output.txt
 # Output on my Intel i5 NUC:

 # The corresponding SQL queries are roughly this:
 #
 # max_time_of_birth:
 # SELECT MAX(time_of_birth) FROM users;
 #
 # users_by_location:
 # -- Originally this was just supposed to select users in location "42"
 # -- but I had to add extra lookups to try to break the cache on the column-based data structure.
 # -- So let's call it: 
 # "select users from location 42 who were born in the first half of the day and aren't the super user"
 # SELECT COUNT(1) FROM users WHERE location = 42 AND time_of_birth < 40000 AND id != 1


 ./a.out
 elapsed: 0s	 cmax_time_of_birth: 86399
 elapsed: 5s	rmax_time_of_birth: 86399
 elasped: 23s records for rusers_by_location count: 9975
 elasped: 2s records for cusers_by_location count: 9975
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define NUM_RECORDS 1000000
	#define NUM_QUERIES 1000 // number of queries to run in a row

	#define LOCATION_ID 42 // for the by_location queries

	struct User {
	int id;
	char name[255];
	int birth_location_id;
	int time_of_birth;
	};

	struct ColDB {
	int *id;
	char *name; // store the names contiguously to mimic row db
	int *birth_location_id;
	int *time_of_birth;
	} col_db;

	// row oriented DB is just an array of users.
	struct User *row_db;
	struct ColDB col_db;

	static void setup(void) {
	row_db = malloc(sizeof(struct User) * NUM_RECORDS);
	assert(row_db != NULL);

	col_db.id = malloc(sizeof(int) * NUM_RECORDS);
	col_db.name = malloc(255 * NUM_RECORDS);
	col_db.birth_location_id = malloc(sizeof(int) * NUM_RECORDS);
	col_db.time_of_birth = malloc(sizeof(int) * NUM_RECORDS);

	srand((unsigned int)time(NULL));

	for (int i = 0; i < NUM_RECORDS; i++) {
	int id = abs(rand());
	int l = abs(rand()) % 100; // make some duplicate birth locations...
	int t = abs(rand()) % 86400; // seconds in a day

	row_db[i].id = id;
	row_db[i].name[32] = '\0'; // just so we can print the names
	row_db[i].birth_location_id = l;
	row_db[i].time_of_birth = t;

	col_db.id[i] = id;
	col_db.name[i * 255 + 32] = '\0';
	col_db.birth_location_id[i] = l;
	col_db.time_of_birth[i] = t;
	}
	}

	static size_t rusers_by_location(struct User *results) {
	size_t count = 0;

	for (int i = 0; i < NUM_RECORDS; i++) {
	struct User u = row_db[i];
	if (u.birth_location_id == LOCATION_ID) {
	results[count++] = u;
	}
	}

	return count;
	}

	static size_t cusers_by_location(struct User *results) {
	size_t count = 0;

	for (int i = 0; i < NUM_RECORDS; i++) {
	if (col_db.birth_location_id[i] == LOCATION_ID) {
	struct User u = {
	.id = col_db.id[i],
	.birth_location_id = col_db.birth_location_id[i],
	.time_of_birth = col_db.time_of_birth[i]
	};

	memcpy(u.name, col_db.name + (i * 255), 255);

	results[count++] = u;
	}
	}

	return count;
	}

	static int cmax_time_of_birth(void) {
	int result = 0;

	for (int i = 0; i < NUM_RECORDS; i++) {
	int t = col_db.time_of_birth[i];
	if (t > result)
	result = t;
	}

	return result;
	}

	static int rmax_time_of_birth(void) {
	int result = 0;

	for (int i = 0; i < NUM_RECORDS; i++) {
	int t = row_db[i].time_of_birth;
	if (t > result)
	result = t;
	}

	return result;
	}

	int main(int argc, char **argv) {
	setup();

	int c, r = 0;
	size_t result_count = 0;

	struct User results = malloc(sizeof(struct User) NUM_RECORDS);
	assert(results != NULL);

	time_t start = time(NULL);
	for (int i = 0; i < NUM_QUERIES; i++) {
	c = cmax_time_of_birth();
	}
	printf("elapsed: %lds\t cmax_time_of_birth: %d\n", time(NULL) - start, c);

	start = time(NULL);
	for (int i = 0; i < NUM_QUERIES; i++) {
	r = rmax_time_of_birth();
	}
	printf("elapsed: %lds\trmax_time_of_birth: %d\n", time(NULL) - start, r);

	start = time(NULL);
	for (int i = 0; i < NUM_QUERIES; i++) {
	result_count = rusers_by_location(results);
	}
	printf("elasped: %lds records for rusers_by_location count: %ld\n",
	time(NULL) - start, result_count);

	start = time(NULL);
	for (int i = 0; i < NUM_QUERIES; i++) {
	result_count = cusers_by_location(results);
	}
	printf("elasped: %lds records for cusers_by_location count: %ld\n",
	time(NULL) - start, result_count);

	return EXIT_SUCCESS;
	}
	default:
	gcc -O0 -g3 -Wall -Wextra -Wconversion -Wdouble-promotion \
	-Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion \
	main.c

	format:
	clang-format -i main.c
	# Output on my Intel i5 NUC:

	# The corresponding SQL queries are roughly this:
	#
	# max_time_of_birth:
	# SELECT MAX(time_of_birth) FROM users;
	#
	# users_by_location:
	# -- Originally this was just supposed to select users in location "42"
	# -- but I had to add extra lookups to try to break the cache on the column-based data structure.
	# -- So let's call it:
	# "select users from location 42 who were born in the first half of the day and aren't the super user"
	# SELECT COUNT(1) FROM users WHERE location = 42 AND time_of_birth < 40000 AND id != 1


	./a.out
	elapsed: 0s cmax_time_of_birth: 86399
	elapsed: 5s rmax_time_of_birth: 86399
	elasped: 23s records for rusers_by_location count: 9975
	elasped: 2s records for cusers_by_location count: 9975