Last active
May 9, 2022 19:40
-
-
Save s-macke/08c8c603ec7c034eb4af49b825c49861 to your computer and use it in GitHub Desktop.
The code measures the direct and indirect cost (L2 cache misses) of context switches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile with
// 'gcc -O2 main.c -o contextperf -lpthread'
#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/errno.h>
#include <time.h>
#include <unistd.h>
// Number of bytes of RAM each thread works on per calculation.
// Defaults to 12 MB; main() replaces it when a size in MB is passed as program argument.
int64_t SIZE = INT64_C(12) * 1024 * 1024;
// --------------------------------------------------------------- | |
// Read the CPU clock counter with the x86 "rdtsc" instruction.
// Returns the 64-bit counter (upper half from EDX, lower half from EAX) as a signed value.
int64_t rdtsc() {
    uint32_t low_bits;
    uint32_t high_bits;
    __asm__ __volatile__ ("rdtsc" : "=a" (low_bits), "=d" (high_bits));

    uint64_t ticks = high_bits;
    ticks <<= 32;
    ticks |= low_bits;
    return (int64_t) ticks;
}
// Get the current calendar time (time base TIME_UTC) in nanoseconds.
// Returns seconds * 1e9 + nanoseconds as reported by timespec_get,
// or -1 if the clock could not be read.
int64_t getTime() {
    struct timespec ts;
    // timespec_get returns the time base on success and 0 on failure
    if (timespec_get(&ts, TIME_UTC) != TIME_UTC) {
        return -1;
    }
    // widen before multiplying so the product cannot overflow where time_t/long are only 32 bit wide
    return (int64_t) ts.tv_sec * INT64_C(1000000000) + (int64_t) ts.tv_nsec;
}
// --------------------------------------------------------------- | |
// Pin the calling thread to one CPU core.
// core_id: zero-based core index; must be smaller than the number of online processors.
// Returns EINVAL for an out-of-range core_id, otherwise the return value of pthread_setaffinity_np.
int stick_this_thread_to_core(int core_id) {
    int online_cores = sysconf(_SC_NPROCESSORS_ONLN);
    int in_range = (core_id >= 0) && (core_id < online_cores);
    if (!in_range) {
        return EINVAL;
    }

    cpu_set_t affinity;
    CPU_ZERO(&affinity);
    CPU_SET(core_id, &affinity);
    return pthread_setaffinity_np(pthread_self(), sizeof affinity, &affinity);
}
// --------------------------------------------------------------- | |
// some random calculation, which uses a specific amount of RAM. Read and write to RAM. | |
// Returns the number of clock cycles needed. | |
int64_t __attribute__ ((noinline)) calculate(char *data) { | |
int64_t clock = rdtsc(); | |
for (int i = 0; i < SIZE; i++) { | |
data[i] *= data[SIZE - i - 1]; | |
} | |
int64_t delta = rdtsc() - clock; | |
return delta; | |
} | |
// --------------------------------------------------------------- | |
// Measurement results shared by the baseline run and both worker threads.
struct {
    // fewest clock cycles one calculation took; baseline for the other numbers
    int64_t shortest_time;
    // accumulated clock cycles between handing over control and the other thread running
    int64_t switch_direct;
    // accumulated clock cycles of the calculation executed right after a switch
    int64_t switch0;
    // accumulated baseline cycles (shortest_time is added once per measurement)
    int64_t switch1;
    // number of measurements accumulated so far
    int64_t n;
    // CPU frequency in GHz
    double cpu_frequency;
} statistics;
// --------------------------------------------------------------- | |
// clock counter value stored right before control is handed to the other thread
volatile int64_t cycles_before = 0;
// ---------------------------------------------------------------
// Hand-over machinery for the thread switching: one shared mutex plus, per thread,
// a condition variable and a flag telling that thread it is its turn to run.
pthread_mutex_t ready_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t ready_cond[2] = {
    PTHREAD_COND_INITIALIZER,
    PTHREAD_COND_INITIALIZER,
};
int ready_flag[2] = {0};
// --------------------------------------------------------------- | |
void *thread(void *vargp) { | |
int threadid = *(int *) vargp; | |
char *data; | |
data = malloc(SIZE); | |
statistics.switch0 = 0; | |
statistics.switch1 = 0; | |
statistics.switch_direct = 0; | |
statistics.n = 0; | |
// warmup 1, allow the kernel to assign the RAM | |
calculate(data); | |
//stick_this_thread_to_core(threadid); | |
stick_this_thread_to_core(0); | |
// warmup 2. calculate on core 0 | |
calculate(data); | |
printf("thread %i ready\n", threadid); | |
for (int j = 0; j < 100; j++) { // average of 100 measurements | |
pthread_mutex_lock(&ready_mutex); | |
while (!ready_flag[threadid]) { | |
pthread_cond_wait(&ready_cond[threadid], &ready_mutex); | |
} | |
ready_flag[threadid] = 0; | |
statistics.switch_direct += (rdtsc() - cycles_before); //the clock cycles the other thread switched to this thread | |
statistics.switch0 += calculate(data); | |
statistics.switch1 += statistics.shortest_time; | |
statistics.n++; | |
// tell the other thread to run. Because of the mutex, the other thread will not run before this thread is done. | |
ready_flag[1 - threadid] = 1; | |
pthread_cond_signal(&ready_cond[1 - threadid]); | |
if (statistics.n != 0) { | |
//int64_t lost = (switch0 - switch1)/n; | |
int64_t lost = statistics.switch0 / statistics.n - statistics.shortest_time; | |
printf("%3i: thread=%i, direct_clock= %li cycles, indirect_clock= %li cycles cpu_frequency= %lf GHz\n", j, threadid, statistics.switch_direct / statistics.n, lost, statistics.cpu_frequency); | |
} | |
// warmup 3, don't trust the first 10 measurements | |
if (j == 10) { | |
statistics.switch0 = 0; | |
statistics.switch1 = 0; | |
statistics.switch_direct = 0; | |
statistics.n = 0; | |
} | |
cycles_before = rdtsc(); // store the clock cycle before the thread switch | |
pthread_mutex_unlock(&ready_mutex); // switch to the other thread | |
} | |
// end of the program | |
free(data); | |
return NULL; | |
} | |
// --------------------------------------------------------------- | |
// determine shortest number of clock cycles, the calculation takes | |
int64_t fastest() { | |
int64_t clock = INT64_MAX; | |
char *data = malloc(SIZE); | |
if (data == NULL) { | |
printf("malloc failed\n"); | |
exit(1); | |
} | |
for (int i = 0; i < 30; i++) { | |
int64_t delta = calculate(data); | |
if (delta < clock) clock = delta; | |
} | |
free(data); | |
return clock; | |
} | |
// determine cpu frequency and store it in the global variable "cpu_frequency" | |
void getFastestRun() { | |
int64_t starttime = getTime(); | |
int64_t startclock = rdtsc(); | |
stick_this_thread_to_core(0); | |
statistics.shortest_time = fastest(); | |
statistics.shortest_time = fastest(); | |
printf("shortest=%li\n", statistics.shortest_time); | |
statistics.cpu_frequency = (double)(rdtsc() - startclock) / (double) (getTime() - starttime); | |
printf("%lf GHz\n", statistics.cpu_frequency); | |
} | |
// --------------------------------------------------------------- | |
int main(int argc, char *argv[]) { | |
// allow to change the amount of RAM used by the threads | |
if (argc == 2) { | |
SIZE = atoi(argv[1]) * 1024 * 1024; | |
if (SIZE == 0) { | |
printf("Error: argument must be a integer larger than 0\n"); | |
return 1; | |
} | |
} | |
printf("RAM usage per thread: %liMB\n", SIZE / 1024 / 1024); | |
getFastestRun(); // determine the CPU frequency and the shortest number of clock cycles for the calculation. | |
// create the threads | |
pthread_t thread_id1; | |
int id0 = 0; | |
pthread_create(&thread_id1, NULL, thread, &id0); | |
pthread_t thread_id2; | |
int id1 = 1; | |
pthread_create(&thread_id2, NULL, thread, &id1); | |
// send thread 0 the signat to start | |
cycles_before = rdtsc(); | |
ready_flag[0] = 1; | |
pthread_cond_signal(&ready_cond[0]); | |
// wait for the threads to finish | |
pthread_join(thread_id2, NULL); | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment