stedolan · October 2, 2011 23:39
diff --git a/corebench.c b/corebench.c
 #define _GNU_SOURCE
 #include <pthread.h>
 #include <sys/types.h>
 #include <stdio.h>
 #include <sched.h>
 #include <assert.h>
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
 struct {
  char pad1[128];
  volatile int turn;
  char pad2[128];
 } shared;
 struct thread_info{
  cpu_set_t cpu1, cpu2;
  pthread_barrier_t barrier;
  unsigned long long start, end;
 };

 static __inline__ unsigned long long rdtsc(void)
 {
  unsigned hi, lo;
  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }

 void* a(struct thread_info* th){
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu1);
  pthread_barrier_wait(&th->barrier);
  int i;

  while (shared.turn == 1)asm volatile ("pause\n");
  th->start = rdtsc();
  shared.turn = 1;

  for (i=0; i<COUNT-1; i++){
    while (shared.turn == 1)asm volatile ("pause\n");
    shared.turn = 1;
  }
  th->end = rdtsc();
  return 0;
 }
 void* b(struct thread_info* th){
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu2);
  pthread_barrier_wait(&th->barrier);
  int i;

  for (i = 0; i<COUNT; i++){
    while (shared.turn == 0)asm volatile ("pause\n");
    shared.turn = 0;
  }
  return 0;
 }

 unsigned long long intercore_latency_test(int i, int j){
  struct thread_info th;
  CPU_ZERO(&th.cpu1);
  CPU_ZERO(&th.cpu2);
  CPU_SET(i, &th.cpu1);
  CPU_SET(j, &th.cpu2);
  pthread_barrier_init(&th.barrier, 0, 2);
  pthread_t ta, tb;
  pthread_create(&ta, 0, (void*(*)(void*))a, &th);
  pthread_create(&tb, 0, (void*(*)(void*))b, &th);
  pthread_join(ta, 0);
  pthread_join(tb, 0);
  return th.end - th.start;
 }


 typedef struct domain_{
  struct domain_* parent;
  int thiscpu; // some CPU in this domain
  int size;
  double width;
 } domain;

 typedef struct{
  domain* d1;
  domain* d2;
  double latency; // average cost of communication d1 <-> d2
  double variance; // variance of latency estimate
  double joined_width; // average cost of communication within d1 + d2
 } comm_latency;


 void intercore_latency(comm_latency* c, int i, int j){
  double s = 0;
  double sq = 0;
  int n;
  for (n=0; n<NSAMPLES; n++){
    double l = intercore_latency_test(i, j) / COUNT / 2;
    s += l;
    sq += l * l;
  }
  c->latency = s / NSAMPLES;
  c->variance = (sq - s*s/NSAMPLES) / (NSAMPLES - 1);
 }

 void mk_cpu_domain(domain* d, int cpu){
  d->parent = 0;
  d->thiscpu = cpu;
  d->size = 1;
  d->width = 0;
 }

 void mk_joined_domain(domain* res, comm_latency* l){
  assert(!l->d1->parent);
  assert(!l->d2->parent);
  res->parent = 0;
  l->d1->parent = res;
  l->d2->parent = res;
  res->thiscpu = l->d1->thiscpu;
  res->size = l->d1->size + l->d2->size;
  res->width = l->joined_width;
 }


 double calc_joined_width(domain* d1, domain* d2, double latency){
  double sd1 = d1->size, sd2 = d2->size;
  double stot = sd1 + sd2;
  return (1.0/(stot * stot)) * 
    ((2 * sd1 * sd2 * latency) + /* comm between d1, d2 */
     (sd1 * sd1 * d1->width) +   /* comm within d1 */
     (sd2 * sd2 * d2->width));   /* comm within d2 */
 }

 double calc_joined_latency(comm_latency* cl1, comm_latency* cl2){
  assert(cl1->d2 == cl2->d2);
  double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
  double stot = sd1 + sd2;
  return (sd1 * cl1->latency + sd2 * cl2->latency) / stot;
 }

 double calc_joined_variance(comm_latency* cl1, comm_latency* cl2){
  assert(cl1->d2 == cl2->d2);
  double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
  double stot = sd1 + sd2;
  return (sd1 * sd1 * cl1->variance + sd2 * sd2 * cl2->variance) / (stot * stot);
 }

 void mk_comm_latency(comm_latency* cl, domain* d1, domain* d2){
  cl->d1 = d1;
  cl->d2 = d2;
  cl->joined_width = calc_joined_width(d1, d2, cl->latency);
 }

 double calc_mergecost(const comm_latency* l){
  return l->latency + l->joined_width;
 }

 domain cpus[NCPUS];

 void dump_domain(domain* dom, int l){
  int i;
  int first = 1;
  char str[40];
  char* s = str;
  for (i=0;i<NCPUS;i++){
    domain* d = &cpus[i];
    while (d && d != dom)
      d = d->parent;
    if (d){
      s += sprintf(s, "%s%d", first ? "" : "/", i);
      first = 0;
    }
  }
  printf(l ? "%-20s" : "%20s", str);
 }

 void dump_comm_latency(comm_latency* cl){
  printf("%6.2f + %6.2f (+/- %6.2f)[", cl->latency, cl->joined_width, sqrt(cl->variance));
  dump_domain(cl->d1, 0);
  printf(" <-> ");
  dump_domain(cl->d2, 1);
  printf("]\n");
 }

 int cmp_comm_latency(const void* c1, const void* c2){
  double d1 = calc_mergecost(c1), d2 = calc_mergecost(c2);
  if (d1 < d2) return -1;
  if (d2 < d1) return +1;
  else return 0;
 }
 void dump(comm_latency* results, int nres){
  int i;
  qsort(results, nres, sizeof(comm_latency), cmp_comm_latency);
  for (i=0; i<nres; i++){
    dump_comm_latency(&results[i]);
  }
  printf("\n");
 }

 void move_to_d1(comm_latency* cl, domain* d){
  if (cl->d2 == d){
    cl->d2 = cl->d1;
    cl->d1 = d;
  }
 }

 void find_trees(comm_latency* results, int nres){
  int i;
  dump(results, nres);
  while (nres){
    // Find the pair of domains to merge
    double min_mergecost = calc_mergecost(&results[0]);
    comm_latency* min_mergecost_cl = &results[0];
    for (i=1; i<nres; i++){
      if (calc_mergecost(&results[i]) < min_mergecost){
        min_mergecost = calc_mergecost(&results[i]);
        min_mergecost_cl = &results[i];
      }
    }
  
    // Create the merged domain
    domain* merged = malloc(sizeof(domain));
    mk_joined_domain(merged, min_mergecost_cl);
  
    // Update all the latency scores to use the new domain
    domain* d1 = min_mergecost_cl->d1;
    domain* d2 = min_mergecost_cl->d2;
   
    printf("Merging "); dump_domain(d1,0); printf(" and "); dump_domain(d2,1); printf("\n");

    // Find comparisons with d2
    comm_latency* out = results;
    comm_latency* comparison_with_d2[NCPUS];
    memset(comparison_with_d2, 0, sizeof(comparison_with_d2));
    for (i=0;i<nres;i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d2);
      if (results[i].d1 == d2){
        comparison_with_d2[results[i].d2->thiscpu] = &results[i];
      }
    }

    // Find comparisons with d1, update to reflect scores with d1+d2
    for (i=0;i<nres;i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d1);
      if (results[i].d1 == d1){
        domain* other = results[i].d2;
        comm_latency* cmp_d1 = &results[i];
        comm_latency* cmp_d2 = comparison_with_d2[other->thiscpu];
        assert(cmp_d1 && cmp_d2 && cmp_d1->d2 == other && cmp_d2->d2 == other);
        // replace cmp_d1 with a new comparison of (d1+d2) and other
        cmp_d1->latency = calc_joined_latency(cmp_d1, cmp_d2);
        mk_comm_latency(cmp_d1, merged, other);
      }
    }
  
    // Remove old comparisons
    for (i=0;i<nres;i++){
      if (&results[i] == min_mergecost_cl) continue;
      move_to_d1(&results[i], d2);
      if (results[i].d1 != d2){
        *out++ = results[i];
      }
    }
  
    nres = out - results;
    dump(results, nres);
  }
 }


 int main(){
  int i, j;

  printf("starting\n");
  for (i=0; i<NCPUS; i++){
    mk_cpu_domain(&cpus[i], i);
  }
  comm_latency results[NCPUS * NCPUS];
  int nres = 0;
  intercore_latency(&results[0], 0, 1);
  
  printf("testing cpu... ");
  fflush(stdout);
  for (i=0; i<NCPUS; i++){
    printf("%d ", i);
    fflush(stdout);
    for (j=0; j<NCPUS; j++){
      if (i >= j) continue;
      intercore_latency(&results[nres], i, j);
      mk_comm_latency(&results[nres], &cpus[i], &cpus[j]);
      nres++;
    }
  }
  printf("\n");
  printf("Building trees...\n");
  find_trees(results, nres);
  
  return 0;
 }
	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sys/types.h>
	#include <stdio.h>
	#include <sched.h>
	#include <assert.h>
	#include <string.h>
	#include <stdlib.h>
	#include <math.h>
	struct {
	char pad1[128];
	volatile int turn;
	char pad2[128];
	} shared;
	struct thread_info{
	cpu_set_t cpu1, cpu2;
	pthread_barrier_t barrier;
	unsigned long long start, end;
	};

	static __inline__ unsigned long long rdtsc(void)
	{
	unsigned hi, lo;
	__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
	return ( (unsigned long long)lo)\|( ((unsigned long long)hi)<<32 );
	}

	void* a(struct thread_info* th){
	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu1);
	pthread_barrier_wait(&th->barrier);
	int i;

	while (shared.turn == 1)asm volatile ("pause\n");
	th->start = rdtsc();
	shared.turn = 1;

	for (i=0; i<COUNT-1; i++){
	while (shared.turn == 1)asm volatile ("pause\n");
	shared.turn = 1;
	}
	th->end = rdtsc();
	return 0;
	}
	void* b(struct thread_info* th){
	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &th->cpu2);
	pthread_barrier_wait(&th->barrier);
	int i;

	for (i = 0; i<COUNT; i++){
	while (shared.turn == 0)asm volatile ("pause\n");
	shared.turn = 0;
	}
	return 0;
	}

	unsigned long long intercore_latency_test(int i, int j){
	struct thread_info th;
	CPU_ZERO(&th.cpu1);
	CPU_ZERO(&th.cpu2);
	CPU_SET(i, &th.cpu1);
	CPU_SET(j, &th.cpu2);
	pthread_barrier_init(&th.barrier, 0, 2);
	pthread_t ta, tb;
	pthread_create(&ta, 0, (void()(void*))a, &th);
	pthread_create(&tb, 0, (void()(void*))b, &th);
	pthread_join(ta, 0);
	pthread_join(tb, 0);
	return th.end - th.start;
	}


	typedef struct domain_{
	struct domain_* parent;
	int thiscpu; // some CPU in this domain
	int size;
	double width;
	} domain;

	typedef struct{
	domain* d1;
	domain* d2;
	double latency; // average cost of communication d1 <-> d2
	double variance; // variance of latency estimate
	double joined_width; // average cost of communication within d1 + d2
	} comm_latency;


	void intercore_latency(comm_latency* c, int i, int j){
	double s = 0;
	double sq = 0;
	int n;
	for (n=0; n<NSAMPLES; n++){
	double l = intercore_latency_test(i, j) / COUNT / 2;
	s += l;
	sq += l * l;
	}
	c->latency = s / NSAMPLES;
	c->variance = (sq - s*s/NSAMPLES) / (NSAMPLES - 1);
	}

	void mk_cpu_domain(domain* d, int cpu){
	d->parent = 0;
	d->thiscpu = cpu;
	d->size = 1;
	d->width = 0;
	}

	void mk_joined_domain(domain* res, comm_latency* l){
	assert(!l->d1->parent);
	assert(!l->d2->parent);
	res->parent = 0;
	l->d1->parent = res;
	l->d2->parent = res;
	res->thiscpu = l->d1->thiscpu;
	res->size = l->d1->size + l->d2->size;
	res->width = l->joined_width;
	}


	double calc_joined_width(domain* d1, domain* d2, double latency){
	double sd1 = d1->size, sd2 = d2->size;
	double stot = sd1 + sd2;
	return (1.0/(stot * stot)) *
	((2 * sd1 * sd2 * latency) + /* comm between d1, d2 */
	(sd1 * sd1 * d1->width) + /* comm within d1 */
	(sd2 * sd2 * d2->width)); /* comm within d2 */
	}

	double calc_joined_latency(comm_latency* cl1, comm_latency* cl2){
	assert(cl1->d2 == cl2->d2);
	double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
	double stot = sd1 + sd2;
	return (sd1 * cl1->latency + sd2 * cl2->latency) / stot;
	}

	double calc_joined_variance(comm_latency* cl1, comm_latency* cl2){
	assert(cl1->d2 == cl2->d2);
	double sd1 = cl1->d1->size, sd2 = cl2->d1->size;
	double stot = sd1 + sd2;
	return (sd1 * sd1 * cl1->variance + sd2 * sd2 * cl2->variance) / (stot * stot);
	}

	void mk_comm_latency(comm_latency* cl, domain* d1, domain* d2){
	cl->d1 = d1;
	cl->d2 = d2;
	cl->joined_width = calc_joined_width(d1, d2, cl->latency);
	}

	double calc_mergecost(const comm_latency* l){
	return l->latency + l->joined_width;
	}

	domain cpus[NCPUS];

	void dump_domain(domain* dom, int l){
	int i;
	int first = 1;
	char str[40];
	char* s = str;
	for (i=0;i<NCPUS;i++){
	domain* d = &cpus[i];
	while (d && d != dom)
	d = d->parent;
	if (d){
	s += sprintf(s, "%s%d", first ? "" : "/", i);
	first = 0;
	}
	}
	printf(l ? "%-20s" : "%20s", str);
	}

	void dump_comm_latency(comm_latency* cl){
	printf("%6.2f + %6.2f (+/- %6.2f)[", cl->latency, cl->joined_width, sqrt(cl->variance));
	dump_domain(cl->d1, 0);
	printf(" <-> ");
	dump_domain(cl->d2, 1);
	printf("]\n");
	}

	int cmp_comm_latency(const void* c1, const void* c2){
	double d1 = calc_mergecost(c1), d2 = calc_mergecost(c2);
	if (d1 < d2) return -1;
	if (d2 < d1) return +1;
	else return 0;
	}
	void dump(comm_latency* results, int nres){
	int i;
	qsort(results, nres, sizeof(comm_latency), cmp_comm_latency);
	for (i=0; i<nres; i++){
	dump_comm_latency(&results[i]);
	}
	printf("\n");
	}

	void move_to_d1(comm_latency* cl, domain* d){
	if (cl->d2 == d){
	cl->d2 = cl->d1;
	cl->d1 = d;
	}
	}

	void find_trees(comm_latency* results, int nres){
	int i;
	dump(results, nres);
	while (nres){
	// Find the pair of domains to merge
	double min_mergecost = calc_mergecost(&results[0]);
	comm_latency* min_mergecost_cl = &results[0];
	for (i=1; i<nres; i++){
	if (calc_mergecost(&results[i]) < min_mergecost){
	min_mergecost = calc_mergecost(&results[i]);
	min_mergecost_cl = &results[i];
	}
	}

	// Create the merged domain
	domain* merged = malloc(sizeof(domain));
	mk_joined_domain(merged, min_mergecost_cl);

	// Update all the latency scores to use the new domain
	domain* d1 = min_mergecost_cl->d1;
	domain* d2 = min_mergecost_cl->d2;

	printf("Merging "); dump_domain(d1,0); printf(" and "); dump_domain(d2,1); printf("\n");

	// Find comparisons with d2
	comm_latency* out = results;
	comm_latency* comparison_with_d2[NCPUS];
	memset(comparison_with_d2, 0, sizeof(comparison_with_d2));
	for (i=0;i<nres;i++){
	if (&results[i] == min_mergecost_cl) continue;
	move_to_d1(&results[i], d2);
	if (results[i].d1 == d2){
	comparison_with_d2[results[i].d2->thiscpu] = &results[i];
	}
	}

	// Find comparisons with d1, update to reflect scores with d1+d2
	for (i=0;i<nres;i++){
	if (&results[i] == min_mergecost_cl) continue;
	move_to_d1(&results[i], d1);
	if (results[i].d1 == d1){
	domain* other = results[i].d2;
	comm_latency* cmp_d1 = &results[i];
	comm_latency* cmp_d2 = comparison_with_d2[other->thiscpu];
	assert(cmp_d1 && cmp_d2 && cmp_d1->d2 == other && cmp_d2->d2 == other);
	// replace cmp_d1 with a new comparison of (d1+d2) and other
	cmp_d1->latency = calc_joined_latency(cmp_d1, cmp_d2);
	mk_comm_latency(cmp_d1, merged, other);
	}
	}

	// Remove old comparisons
	for (i=0;i<nres;i++){
	if (&results[i] == min_mergecost_cl) continue;
	move_to_d1(&results[i], d2);
	if (results[i].d1 != d2){
	*out++ = results[i];
	}
	}

	nres = out - results;
	dump(results, nres);
	}
	}


	int main(){
	int i, j;

	printf("starting\n");
	for (i=0; i<NCPUS; i++){
	mk_cpu_domain(&cpus[i], i);
	}
	comm_latency results[NCPUS * NCPUS];
	int nres = 0;
	intercore_latency(&results[0], 0, 1);

	printf("testing cpu... ");
	fflush(stdout);
	for (i=0; i<NCPUS; i++){
	printf("%d ", i);
	fflush(stdout);
	for (j=0; j<NCPUS; j++){
	if (i >= j) continue;
	intercore_latency(&results[nres], i, j);
	mk_comm_latency(&results[nres], &cpus[i], &cpus[j]);
	nres++;
	}
	}
	printf("\n");
	printf("Building trees...\n");
	find_trees(results, nres);

	return 0;
	}