@stedolan
Created October 2, 2011 23:39
Inter-core communication latencies
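/*
 * Measures inter-core communication latency by bouncing a shared flag
 * between two threads pinned to different CPUs, timed with rdtsc, then
 * greedily clusters the CPUs into a tree of communication domains by
 * repeatedly merging the cheapest pair.
 *
 * Assumed build command (filename and tunables are placeholders; see the
 * #ifndef block below):
 *   gcc -O2 -pthread intercore_latency.c -lm -o intercore_latency
 */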
#define _GNU_SOURCE
#include <pthread.h>
#include <sys/types.h>
#include <stdio.h>
#include <sched.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
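/* Tunable parameters.  The original leaves these undefined (presumably
   supplied with -D at compile time); the defaults below are assumptions. */
#ifndef NCPUS
#define NCPUS 8          /* number of logical CPUs to test */
#endif
#ifndef COUNT
#define COUNT 1000000    /* ping-pong round trips per timed run */
#endif
#ifndef NSAMPLES
#define NSAMPLES 10      /* timed runs per CPU pair */
#endif

/* Ping-pong flag, padded on both sides so it sits on its own cache line
   and the two threads only contend on the line being measured. */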
struct {
  char pad1[128];
  volatile int turn;
  char pad2[128];
} shared;
struct thread_info {
  cpu_set_t cpu1, cpu2;
  pthread_barrier_t barrier;
  unsigned long long start, end;
};
/* Read the x86 time-stamp counter. */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned hi, lo;
  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
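/* Thread a: pinned to cpu1.  Bounces shared.turn back and forth with thread
   b and reads the TSC before and after, so end - start covers COUNT round
   trips between the two cores. */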
void* a(struct thread_info* th){
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu1);
  pthread_barrier_wait(&th->barrier);
  int i;
  while (shared.turn == 1) asm volatile ("pause\n");
  th->start = rdtsc();
  shared.turn = 1;
  for (i = 0; i < COUNT - 1; i++){
    while (shared.turn == 1) asm volatile ("pause\n");
    shared.turn = 1;
  }
  th->end = rdtsc();
  return 0;
}
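/* Thread b: pinned to cpu2, the other half of the ping-pong. */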
void* b(struct thread_info* th){
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu2);
  pthread_barrier_wait(&th->barrier);
  int i;
  for (i = 0; i < COUNT; i++){
    while (shared.turn == 0) asm volatile ("pause\n");
    shared.turn = 0;
  }
  return 0;
}
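/* Run one ping-pong between CPUs i and j and return the total number of TSC
   ticks taken for COUNT round trips. */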
unsigned long long intercore_latency_test(int i, int j){
  struct thread_info th;
  CPU_ZERO(&th.cpu1);
  CPU_ZERO(&th.cpu2);
  CPU_SET(i, &th.cpu1);
  CPU_SET(j, &th.cpu2);
  pthread_barrier_init(&th.barrier, 0, 2);
  pthread_t ta, tb;
  pthread_create(&ta, 0, (void*(*)(void*))a, &th);
  pthread_create(&tb, 0, (void*(*)(void*))b, &th);
  pthread_join(ta, 0);
  pthread_join(tb, 0);
  pthread_barrier_destroy(&th.barrier);  /* added: release the barrier */
  return th.end - th.start;
}
typedef struct domain_ {
  struct domain_* parent;
  int thiscpu;   // some CPU in this domain
  int size;      // number of CPUs in this domain
  double width;  // average cost of communication within this domain
} domain;
typedef struct {
  domain* d1;
  domain* d2;
  double latency;       // average cost of communication d1 <-> d2
  double variance;      // variance of latency estimate
  double joined_width;  // average cost of communication within d1 + d2
} comm_latency;
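/* Estimate the one-way latency between CPUs i and j: run NSAMPLES ping-pongs,
   divide each total by COUNT round trips and by the 2 hops per round trip,
   and record the sample mean and variance. */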
void intercore_latency(comm_latency* c, int i, int j){
  double s = 0;
  double sq = 0;
  int n;
  for (n = 0; n < NSAMPLES; n++){
    /* cast before dividing to avoid integer truncation of partial cycles */
    double l = (double)intercore_latency_test(i, j) / COUNT / 2;
    s += l;
    sq += l * l;
  }
  c->latency = s / NSAMPLES;
  c->variance = (sq - s*s/NSAMPLES) / (NSAMPLES - 1);
}
void mk_cpu_domain(domain* d, int cpu){
  d->parent = 0;
  d->thiscpu = cpu;
  d->size = 1;
  d->width = 0;
}
void mk_joined_domain(domain* res, comm_latency* l){
  assert(!l->d1->parent);
  assert(!l->d2->parent);
  res->parent = 0;
  l->d1->parent = res;
  l->d2->parent = res;
  res->thiscpu = l->d1->thiscpu;
  res->size = l->d1->size + l->d2->size;
  res->width = l->joined_width;
}
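/* Average communication cost inside the union of d1 and d2, weighting every
   ordered pair of member CPUs equally:
     width(d1+d2) = (2*s1*s2*latency + s1*s1*width(d1) + s2*s2*width(d2)) / (s1+s2)^2 */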
double calc_joined_width(domain* d1, domain* d2, double latency){
  double sd1 = d1->size, sd2 = d2->size;
  double stot = sd1 + sd2;
  return (1.0 / (stot * stot)) *
         ((2 * sd1 * sd2 * latency) +  /* comm between d1, d2 */
          (sd1 * sd1 * d1->width) +    /* comm within d1 */
          (sd2 * sd2 * d2->width));    /* comm within d2 */
}
double calc_joined_latency(comm_latency* cl1, comm_latency* cl2){
  assert(cl1->d2 == cl2->d2);
  double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
  double stot = sd1 + sd2;
  return (sd1 * cl1->latency + sd2 * cl2->latency) / stot;
}
double calc_joined_variance(comm_latency* cl1, comm_latency* cl2){
  assert(cl1->d2 == cl2->d2);
  double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
  double stot = sd1 + sd2;
  return (sd1 * sd1 * cl1->variance + sd2 * sd2 * cl2->variance) / (stot * stot);
}
/* cl->latency must already be filled in when this is called. */
void mk_comm_latency(comm_latency* cl, domain* d1, domain* d2){
  cl->d1 = d1;
  cl->d2 = d2;
  cl->joined_width = calc_joined_width(d1, d2, cl->latency);
}
double calc_mergecost(const comm_latency* l){
  return l->latency + l->joined_width;
}
domain cpus[NCPUS];
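/* Print the CPUs belonging to dom (every leaf whose parent chain reaches it),
   left- or right-justified in a 20-character field depending on l. */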
void dump_domain(domain* dom, int l){
  int i;
  int first = 1;
  char str[40];
  char* s = str;
  for (i = 0; i < NCPUS; i++){
    domain* d = &cpus[i];
    while (d && d != dom)
      d = d->parent;
    if (d){
      s += sprintf(s, "%s%d", first ? "" : "/", i);
      first = 0;
    }
  }
  printf(l ? "%-20s" : "%20s", str);
}
void dump_comm_latency(comm_latency* cl){
  printf("%6.2f + %6.2f (+/- %6.2f)[", cl->latency, cl->joined_width, sqrt(cl->variance));
  dump_domain(cl->d1, 0);
  printf(" <-> ");
  dump_domain(cl->d2, 1);
  printf("]\n");
}
int cmp_comm_latency(const void* c1, const void* c2){
  double d1 = calc_mergecost(c1), d2 = calc_mergecost(c2);
  if (d1 < d2) return -1;
  if (d2 < d1) return +1;
  return 0;
}
void dump(comm_latency* results, int nres){
  int i;
  qsort(results, nres, sizeof(comm_latency), cmp_comm_latency);
  for (i = 0; i < nres; i++){
    dump_comm_latency(&results[i]);
  }
  printf("\n");
}
void move_to_d1(comm_latency* cl, domain* d){
  if (cl->d2 == d){
    cl->d2 = cl->d1;
    cl->d1 = d;
  }
}
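/* Greedy agglomerative clustering: repeatedly merge the pair of domains with
   the lowest merge cost (inter-domain latency plus the width of the merged
   domain), fold the remaining comparisons onto the new domain, and repeat
   until no comparisons are left.  Each merge prints the current table. */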
void find_trees(comm_latency* results, int nres){
  int i;
  dump(results, nres);
  while (nres){
    // Find the pair of domains to merge
    double min_mergecost = calc_mergecost(&results[0]);
    comm_latency* min_mergecost_cl = &results[0];
    for (i = 1; i < nres; i++){
      if (calc_mergecost(&results[i]) < min_mergecost){
        min_mergecost = calc_mergecost(&results[i]);
        min_mergecost_cl = &results[i];
      }
    }
    // Create the merged domain
    domain* merged = malloc(sizeof(domain));
    mk_joined_domain(merged, min_mergecost_cl);
    // Update all the latency scores to use the new domain
    domain* d1 = min_mergecost_cl->d1;
    domain* d2 = min_mergecost_cl->d2;
    printf("Merging "); dump_domain(d1, 0); printf(" and "); dump_domain(d2, 1); printf("\n");
    // Find comparisons with d2
    comm_latency* out = results;
    comm_latency* comparison_with_d2[NCPUS];
    memset(comparison_with_d2, 0, sizeof(comparison_with_d2));
    for (i = 0; i < nres; i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d2);
      if (results[i].d1 == d2){
        comparison_with_d2[results[i].d2->thiscpu] = &results[i];
      }
    }
    // Find comparisons with d1, update to reflect scores with d1+d2
    for (i = 0; i < nres; i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d1);
      if (results[i].d1 == d1){
        domain* other = results[i].d2;
        comm_latency* cmp_d1 = &results[i];
        comm_latency* cmp_d2 = comparison_with_d2[other->thiscpu];
        assert(cmp_d1 && cmp_d2 && cmp_d1->d2 == other && cmp_d2->d2 == other);
        // replace cmp_d1 with a new comparison of (d1+d2) and other
        cmp_d1->latency = calc_joined_latency(cmp_d1, cmp_d2);
        cmp_d1->variance = calc_joined_variance(cmp_d1, cmp_d2);  /* added: keep the error estimate current */
        mk_comm_latency(cmp_d1, merged, other);
      }
    }
    // Remove old comparisons
    for (i = 0; i < nres; i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d2);
      if (results[i].d1 != d2){
        *out++ = results[i];
      }
    }
    nres = out - results;
    dump(results, nres);
  }
}
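/* Measure every unordered pair of CPUs, then build the hierarchy. */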
int main(){
  int i, j;
  printf("starting\n");
  for (i = 0; i < NCPUS; i++){
    mk_cpu_domain(&cpus[i], i);
  }
  comm_latency results[NCPUS * NCPUS];
  int nres = 0;
  /* presumably a warm-up run; its result is overwritten by the loop below */
  intercore_latency(&results[0], 0, 1);
  printf("testing cpu... ");
  fflush(stdout);
  for (i = 0; i < NCPUS; i++){
    printf("%d ", i);
    fflush(stdout);
    for (j = 0; j < NCPUS; j++){
      if (i >= j) continue;
      intercore_latency(&results[nres], i, j);
      mk_comm_latency(&results[nres], &cpus[i], &cpus[j]);
      nres++;
    }
  }
  printf("\n");
  printf("Building trees...\n");
  find_trees(results, nres);
  return 0;
}