Created
October 2, 2011 23:39
-
-
Save stedolan/1258111 to your computer and use it in GitHub Desktop.
Inter-core communication latencies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#include <pthread.h> | |
#include <sys/types.h> | |
#include <stdio.h> | |
#include <sched.h> | |
#include <assert.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <math.h> | |
struct { | |
char pad1[128]; | |
volatile int turn; | |
char pad2[128]; | |
} shared; | |
struct thread_info{ | |
cpu_set_t cpu1, cpu2; | |
pthread_barrier_t barrier; | |
unsigned long long start, end; | |
}; | |
static __inline__ unsigned long long rdtsc(void) | |
{ | |
unsigned hi, lo; | |
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); | |
} | |
void* a(struct thread_info* th){ | |
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu1); | |
pthread_barrier_wait(&th->barrier); | |
int i; | |
while (shared.turn == 1)asm volatile ("pause\n"); | |
th->start = rdtsc(); | |
shared.turn = 1; | |
for (i=0; i<COUNT-1; i++){ | |
while (shared.turn == 1)asm volatile ("pause\n"); | |
shared.turn = 1; | |
} | |
th->end = rdtsc(); | |
return 0; | |
} | |
void* b(struct thread_info* th){ | |
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu2); | |
pthread_barrier_wait(&th->barrier); | |
int i; | |
for (i = 0; i<COUNT; i++){ | |
while (shared.turn == 0)asm volatile ("pause\n"); | |
shared.turn = 0; | |
} | |
return 0; | |
} | |
unsigned long long intercore_latency_test(int i, int j){ | |
struct thread_info th; | |
CPU_ZERO(&th.cpu1); | |
CPU_ZERO(&th.cpu2); | |
CPU_SET(i, &th.cpu1); | |
CPU_SET(j, &th.cpu2); | |
pthread_barrier_init(&th.barrier, 0, 2); | |
pthread_t ta, tb; | |
pthread_create(&ta, 0, (void*(*)(void*))a, &th); | |
pthread_create(&tb, 0, (void*(*)(void*))b, &th); | |
pthread_join(ta, 0); | |
pthread_join(tb, 0); | |
return th.end - th.start; | |
} | |
typedef struct domain_{ | |
struct domain_* parent; | |
int thiscpu; // some CPU in this domain | |
int size; | |
double width; | |
} domain; | |
typedef struct{ | |
domain* d1; | |
domain* d2; | |
double latency; // average cost of communication d1 <-> d2 | |
double variance; // variance of latency estimate | |
double joined_width; // average cost of communication within d1 + d2 | |
} comm_latency; | |
void intercore_latency(comm_latency* c, int i, int j){ | |
double s = 0; | |
double sq = 0; | |
int n; | |
for (n=0; n<NSAMPLES; n++){ | |
double l = intercore_latency_test(i, j) / COUNT / 2; | |
s += l; | |
sq += l * l; | |
} | |
c->latency = s / NSAMPLES; | |
c->variance = (sq - s*s/NSAMPLES) / (NSAMPLES - 1); | |
} | |
void mk_cpu_domain(domain* d, int cpu){ | |
d->parent = 0; | |
d->thiscpu = cpu; | |
d->size = 1; | |
d->width = 0; | |
} | |
void mk_joined_domain(domain* res, comm_latency* l){ | |
assert(!l->d1->parent); | |
assert(!l->d2->parent); | |
res->parent = 0; | |
l->d1->parent = res; | |
l->d2->parent = res; | |
res->thiscpu = l->d1->thiscpu; | |
res->size = l->d1->size + l->d2->size; | |
res->width = l->joined_width; | |
} | |
double calc_joined_width(domain* d1, domain* d2, double latency){ | |
double sd1 = d1->size, sd2 = d2->size; | |
double stot = sd1 + sd2; | |
return (1.0/(stot * stot)) * | |
((2 * sd1 * sd2 * latency) + /* comm between d1, d2 */ | |
(sd1 * sd1 * d1->width) + /* comm within d1 */ | |
(sd2 * sd2 * d2->width)); /* comm within d2 */ | |
} | |
double calc_joined_latency(comm_latency* cl1, comm_latency* cl2){ | |
assert(cl1->d2 == cl2->d2); | |
double sd1 = cl1->d1->size, sd2 = cl2->d1->size; | |
double stot = sd1 + sd2; | |
return (sd1 * cl1->latency + sd2 * cl2->latency) / stot; | |
} | |
double calc_joined_variance(comm_latency* cl1, comm_latency* cl2){ | |
assert(cl1->d2 == cl2->d2); | |
double sd1 = cl1->d1->size, sd2 = cl2->d1->size; | |
double stot = sd1 + sd2; | |
return (sd1 * sd1 * cl1->variance + sd2 * sd2 * cl2->variance) / (stot * stot); | |
} | |
void mk_comm_latency(comm_latency* cl, domain* d1, domain* d2){ | |
cl->d1 = d1; | |
cl->d2 = d2; | |
cl->joined_width = calc_joined_width(d1, d2, cl->latency); | |
} | |
double calc_mergecost(const comm_latency* l){ | |
return l->latency + l->joined_width; | |
} | |
domain cpus[NCPUS]; | |
void dump_domain(domain* dom, int l){ | |
int i; | |
int first = 1; | |
char str[40]; | |
char* s = str; | |
for (i=0;i<NCPUS;i++){ | |
domain* d = &cpus[i]; | |
while (d && d != dom) | |
d = d->parent; | |
if (d){ | |
s += sprintf(s, "%s%d", first ? "" : "/", i); | |
first = 0; | |
} | |
} | |
printf(l ? "%-20s" : "%20s", str); | |
} | |
void dump_comm_latency(comm_latency* cl){ | |
printf("%6.2f + %6.2f (+/- %6.2f)[", cl->latency, cl->joined_width, sqrt(cl->variance)); | |
dump_domain(cl->d1, 0); | |
printf(" <-> "); | |
dump_domain(cl->d2, 1); | |
printf("]\n"); | |
} | |
int cmp_comm_latency(const void* c1, const void* c2){ | |
double d1 = calc_mergecost(c1), d2 = calc_mergecost(c2); | |
if (d1 < d2) return -1; | |
if (d2 < d1) return +1; | |
else return 0; | |
} | |
void dump(comm_latency* results, int nres){ | |
int i; | |
qsort(results, nres, sizeof(comm_latency), cmp_comm_latency); | |
for (i=0; i<nres; i++){ | |
dump_comm_latency(&results[i]); | |
} | |
printf("\n"); | |
} | |
void move_to_d1(comm_latency* cl, domain* d){ | |
if (cl->d2 == d){ | |
cl->d2 = cl->d1; | |
cl->d1 = d; | |
} | |
} | |
void find_trees(comm_latency* results, int nres){ | |
int i; | |
dump(results, nres); | |
while (nres){ | |
// Find the pair of domains to merge | |
double min_mergecost = calc_mergecost(&results[0]); | |
comm_latency* min_mergecost_cl = &results[0]; | |
for (i=1; i<nres; i++){ | |
if (calc_mergecost(&results[i]) < min_mergecost){ | |
min_mergecost = calc_mergecost(&results[i]); | |
min_mergecost_cl = &results[i]; | |
} | |
} | |
// Create the merged domain | |
domain* merged = malloc(sizeof(domain)); | |
mk_joined_domain(merged, min_mergecost_cl); | |
// Update all the latency scores to use the new domain | |
domain* d1 = min_mergecost_cl->d1; | |
domain* d2 = min_mergecost_cl->d2; | |
printf("Merging "); dump_domain(d1,0); printf(" and "); dump_domain(d2,1); printf("\n"); | |
// Find comparisons with d2 | |
comm_latency* out = results; | |
comm_latency* comparison_with_d2[NCPUS]; | |
memset(comparison_with_d2, 0, sizeof(comparison_with_d2)); | |
for (i=0;i<nres;i++){ | |
if (&results[i] == min_mergecost_cl) continue; | |
move_to_d1(&results[i], d2); | |
if (results[i].d1 == d2){ | |
comparison_with_d2[results[i].d2->thiscpu] = &results[i]; | |
} | |
} | |
// Find comparisons with d1, update to reflect scores with d1+d2 | |
for (i=0;i<nres;i++){ | |
if (&results[i] == min_mergecost_cl) continue; | |
move_to_d1(&results[i], d1); | |
if (results[i].d1 == d1){ | |
domain* other = results[i].d2; | |
comm_latency* cmp_d1 = &results[i]; | |
comm_latency* cmp_d2 = comparison_with_d2[other->thiscpu]; | |
assert(cmp_d1 && cmp_d2 && cmp_d1->d2 == other && cmp_d2->d2 == other); | |
// replace cmp_d1 with a new comparison of (d1+d2) and other | |
cmp_d1->latency = calc_joined_latency(cmp_d1, cmp_d2); | |
mk_comm_latency(cmp_d1, merged, other); | |
} | |
} | |
// Remove old comparisons | |
for (i=0;i<nres;i++){ | |
if (&results[i] == min_mergecost_cl) continue; | |
move_to_d1(&results[i], d2); | |
if (results[i].d1 != d2){ | |
*out++ = results[i]; | |
} | |
} | |
nres = out - results; | |
dump(results, nres); | |
} | |
} | |
int main(){ | |
int i, j; | |
printf("starting\n"); | |
for (i=0; i<NCPUS; i++){ | |
mk_cpu_domain(&cpus[i], i); | |
} | |
comm_latency results[NCPUS * NCPUS]; | |
int nres = 0; | |
intercore_latency(&results[0], 0, 1); | |
printf("testing cpu... "); | |
fflush(stdout); | |
for (i=0; i<NCPUS; i++){ | |
printf("%d ", i); | |
fflush(stdout); | |
for (j=0; j<NCPUS; j++){ | |
if (i >= j) continue; | |
intercore_latency(&results[nres], i, j); | |
mk_comm_latency(&results[nres], &cpus[i], &cpus[j]); | |
nres++; | |
} | |
} | |
printf("\n"); | |
printf("Building trees...\n"); | |
find_trees(results, nres); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment