okeefm · April 19, 2011 02:48
diff --git a/mm.c b/mm.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <mpi.h>
 #include <string.h>

 #if defined(__i386__)

 static __inline__ unsigned long long rdtsc(void)
 {
  unsigned long long int x;
  __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
  return x;
 }
 #elif defined(__x86_64__)


 static __inline__ unsigned long long rdtsc(void)
 {
  unsigned hi, lo;
  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }

 #elif defined(__powerpc__)
 /*
  * PowerPC counterpart of rdtsc(): reads the time base with mftbu/mftb and
  * returns it as one 64-bit value (upper word in bits 63..32).
  */
 static __inline__ unsigned long long rdtsc(void)
 {
  unsigned long long int result=0;
  unsigned long int upper, lower,tmp;
  /*
   * Read upper, lower, then upper again; if the two upper reads differ
   * the sequence is retried from label 0 so that both halves belong to
   * the same reading.
   */
  __asm__ volatile(
                "0:                  \n"
                "\tmftbu   %0           \n"
                "\tmftb    %1           \n"
                "\tmftbu   %2           \n"
                "\tcmpw    %2,%0        \n"
                "\tbne     0b         \n"
                : "=r"(upper),"=r"(lower),"=r"(tmp)
 		   );
  result = upper;
  result = result<<32;
  result = result|lower;

  return(result);
 }
 #endif

 /***********************************************************************/
 /* START: MT 19937******************************************************/
 /***********************************************************************/

 /*
  * MT19937 constants and the generator state shared by init_genrand(),
  * init_by_array() and genrand_int32().
  */
 /* Period parameters */  
 #define N 624
 #define M 397
 #define MATRIX_A 0x9908b0dfUL   /* constant vector a */
 #define UPPER_MASK 0x80000000UL /* most significant w-r bits */
 #define LOWER_MASK 0x7fffffffUL /* least significant r bits */

 static unsigned long mt[N]; /* the array for the state vector  */
 /* Next index to read from mt[]; genrand_int32() regenerates the array once it reaches N. */
 static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

 /*
  * Seed the generator from a single value.
  *
  * Fills all N words of mt[] from the low 32 bits of s using the
  * multiplier 1812433253 (Knuth, TAOCP Vol. 2, 3rd ed., p. 106) and
  * leaves mti == N so the next genrand_int32() call regenerates the state.
  */
 void init_genrand(unsigned long s)
 {
     unsigned long prev = s & 0xffffffffUL;
     int idx;

     mt[0] = prev;
     for (idx = 1; idx < N; idx++) {
         unsigned long mixed = prev ^ (prev >> 30);

         /* The mask keeps each word at 32 bits where unsigned long is wider. */
         prev = (1812433253UL * mixed + idx) & 0xffffffffUL;
         mt[idx] = prev;
     }
     mti = N;
 }

 /*
  * Seed the generator from an array of keys.
  *
  * init_key    array of seed words
  * key_length  number of entries in init_key
  *
  * The state is first seeded with 19650218, then every word is mixed with
  * the key (cycling through it) and finally scrambled once more.
  */
 void init_by_array(unsigned long init_key[], int key_length)
 {
     int pos = 1;        /* word of mt[] being rewritten */
     int key_pos = 0;    /* current entry of init_key */
     int remaining;

     init_genrand(19650218UL);

     /* Pass 1: fold the key in, max(N, key_length) steps. */
     remaining = (key_length > N) ? key_length : N;
     while (remaining > 0) {
         unsigned long prev = mt[pos - 1];
         unsigned long word = mt[pos] ^ ((prev ^ (prev >> 30)) * 1664525UL);

         mt[pos] = (word + init_key[key_pos] + key_pos) & 0xffffffffUL;
         pos++;
         key_pos++;
         if (pos >= N) {
             mt[0] = mt[N - 1];
             pos = 1;
         }
         if (key_pos >= key_length)
             key_pos = 0;
         remaining--;
     }

     /* Pass 2: N-1 further non-linear steps without the key. */
     for (remaining = N - 1; remaining > 0; remaining--) {
         unsigned long prev = mt[pos - 1];
         unsigned long word = mt[pos] ^ ((prev ^ (prev >> 30)) * 1566083941UL);

         mt[pos] = (word - pos) & 0xffffffffUL;
         pos++;
         if (pos >= N) {
             mt[0] = mt[N - 1];
             pos = 1;
         }
     }

     /* MSB set so the initial array can never be all zero. */
     mt[0] = 0x80000000UL;
 }

 /*
  * Return the next 32-bit output, in [0, 0xffffffff].
  *
  * When all N state words have been consumed the whole array is
  * regenerated in one go; if no seeding function was ever called the
  * default seed 5489 is used first.
  */
 unsigned long genrand_int32(void)
 {
     unsigned long word;

     if (mti >= N) {
         int idx;

         if (mti == N + 1)
             init_genrand(5489UL);

         /*
          * One pass over the state.  The modular indices reproduce the
          * classic three-part update: word idx is combined with word
          * idx+1 and replaced using word idx+M, both wrapping at N.
          */
         for (idx = 0; idx < N; idx++) {
             unsigned long next = mt[(idx + 1) % N];
             unsigned long merged = (mt[idx] & UPPER_MASK) | (next & LOWER_MASK);
             unsigned long twist = (merged & 0x1UL) ? MATRIX_A : 0x0UL;

             mt[idx] = mt[(idx + M) % N] ^ (merged >> 1) ^ twist;
         }
         mti = 0;
     }

     word = mt[mti];
     mti++;

     /* Tempering */
     word ^= word >> 11;
     word ^= (word << 7) & 0x9d2c5680UL;
     word ^= (word << 15) & 0xefc60000UL;
     word ^= word >> 18;

     return word;
 }

 /* Return a random integer in [0, 0x7fffffff]: one 32-bit draw with the low bit dropped. */
 long genrand_int31(void)
 {
     unsigned long raw = genrand_int32();

     return (long)(raw >> 1);
 }

 /* Return a random real in the closed interval [0,1]. */
 double genrand_real1(void)
 {
     const double scale = 1.0/4294967295.0;   /* 1 / (2^32 - 1) */

     return genrand_int32() * scale;
 }

 /* Return a random real in the half-open interval [0,1). */
 double genrand_real2(void)
 {
     const double scale = 1.0/4294967296.0;   /* 1 / 2^32 */

     return genrand_int32() * scale;
 }

 /* Return a random real in the open interval (0,1). */
 double genrand_real3(void)
 {
     const double scale = 1.0/4294967296.0;   /* 1 / 2^32 */
     double centred = ((double)genrand_int32()) + 0.5;   /* shift off zero */

     return centred * scale;
 }

 /*
  * Return a random real in [0,1) with 53-bit resolution, built from two
  * consecutive 32-bit draws (27 bits from the first, 26 from the second).
  */
 double genrand_res53(void)
 {
     unsigned long high27 = genrand_int32() >> 5;
     unsigned long low26 = genrand_int32() >> 6;
     double combined = high27 * 67108864.0 + low26;   /* high27 * 2^26 + low26 */

     return combined * (1.0/9007199254740992.0);      /* divide by 2^53 */
 }
 /* These real versions are due to Isaku Wada, 2002/01/09 added */

 /***********************************************************************/
 /* END: MT 19937 *******************************************************/
 /***********************************************************************/

 /* Standard matrix multiplication */
 /* Arrays start at 0 */

 /* Debug level read from argv[4]: main() prints C after every round when it
    is non-zero and additionally dumps the sizes, A and B when it exceeds 1. */
 char DEBUG_ON = 0;

 /* Local rows of A: main() allocates NP row pointers, each to Nc doubles. */
 double **A=NULL;
 /* Local rows of the result C, allocated with the same layout as A. */
 double **C=NULL;
 /* Block of B currently being multiplied (sizeB = NP*Nc doubles). */
 double *B_buf_in = NULL;
 /* Copy of B_buf_in that main() passes to MPI_Isend. */
 double *B_buf_out = NULL;
 /* Matrix dimension; main() overrides it from argv[3]. */
 unsigned int Nc=8000;
 /* Key handed to init_by_array(); main() replaces element 0 with the MPI rank. */
 unsigned long rng_init_seeds[6]={0x0, 0x123, 0x234, 0x345, 0x456, 0x789};
 unsigned long rng_init_length=6;
 /* Divisors that turn rdtsc() tick differences into seconds. */
 double clock_rateK=2666700000.0;	// Kratos
 double clock_rateBGL=700000000.0;	// Blue Gene/L
 double clock_rate = 0;	// set for BGL or Kratos

 /*
  * Multiply the local rows of A by one block of B, accumulate the result
  * into C and report how long that took.
  *
  * A        NP row pointers, each to Nc doubles
  * B        Nc x NP block, element (k, j) stored at B[k*NP + j]
  * C        NP row pointers; columns B_start .. B_start+NP-1 are updated
  * B_start  first column of C that receives this block's contribution
  * NP       number of local rows of A and of columns in the B block
  *
  * Reads the globals Nc (inner dimension) and clock_rate (ticks per second).
  * Returns the elapsed time in seconds.
  */
 double matrix_multiply( double **A, double *B, double **C, int B_start, int NP )
 {
 	int i, j;
 	unsigned int k;	/* same type as Nc: no signed/unsigned comparison */
 	unsigned long long start, end;

 	start = rdtsc();
 	for (i = 0; i < NP; i++)
 		for (j = 0; j < NP; j++)
 			for (k = 0; k < Nc; k++) {
 				C[ i ][ j+B_start ] += A[i][k] * B[(size_t)k*NP + j];
 			}
 	end = rdtsc();

 	/* Subtract as integers first: converting each 64-bit counter to
 	   double before subtracting can round away the low-order ticks. */
 	return (double)(end - start)/clock_rate;
 }

 void main( int argc, char* argv[])
 {
 	int i, j;
 	int taskid, numtasks, dest;
 	int intsize,dbsize;
 	int P,NP,sizeB;
 	int count = 0;
 	double mult_time_l = 0, send_time_l = 0;
 	double mult_time[1024];
 	double send_time[1024];
 	
 	double mult[3];
 	double send[3];
 	double total_time = 0;
 	double data;
 	
 	int Use_nodes = 0; //max nodes to use for a job, use all if 0
 	
 	unsigned long long start,start_a, end,end_a;
 	
 	MPI_Status status;
 	MPI_Request recv_req[16], send_req[16];
 	int recv_index[16], recv_count = 0;
 	
 	intsize = sizeof(int);
 	dbsize = sizeof(double);

 	start_a = rdtsc();

 	MPI_Init(&argc, &argv);
 	MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
 	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
 	
 	clock_rate = clock_rateBGL;	// set for BGL
 	
 	if(argc > 1) {
 		Use_nodes = atoi(argv[1]);
 		numtasks = Use_nodes
 	}
 	if(argc > 2) {
 		if (argv[2] == "k" || argv[2] == "K")
 			clock_rate = clock_rateK;	// set for Kratos
 	}
 	if(argc > 3) {
 		Nc = atoi(argv[3]);
 	}
 	if(argc > 4) {
 		DEBUG_ON = (char)atoi(argv[4]);
 	}
 	
 	if (Use_nodes > 0 && taskid < Use_nodes) {
 	
 		P = numtasks;
 		NP = Nc/P;
 		sizeB = NP*Nc;
 		if(taskid==0 && DEBUG_ON > 1) {
 			printf("sizeB:%d\n",sizeB);
 			printf("taskid:%d\n",taskid);
 			printf("NP:%d\n", NP);
 			printf("P:%d\n",P);
 		}
 		dest = (taskid+1)%numtasks;	
 	
 		// WHEN USING MPI DO: rng_init_seeds[0] = my_rank;
 		rng_init_seeds[0] = taskid;
 		init_by_array(rng_init_seeds, rng_init_length);
 	
 		//Allocate space
 		B_buf_in = (double*)calloc(sizeB, dbsize);
 		B_buf_out = (double*)calloc(sizeB, dbsize);

 		A = (double **)calloc( NP, sizeof(double*));
 		for( i = 0; i < NP; i++ ) 
 			A[i] = (double *)calloc( Nc, sizeof(double));

 		C = (double **)calloc( NP, sizeof(double*));
 		for( i = 0; i < NP; i++ )
 			C[i] = (double *)calloc( Nc, sizeof(double));

 		//Initialize A and B
 		for( i = 0; i < NP; i++ ) {
 			for( j = 0; j < Nc; j++ ) {
 				A[i][j] = genrand_res53();
 				B_buf_in[(j*NP)+i] = genrand_res53();
 			}
 		}
 		if(DEBUG_ON > 1) {
 			printf("\nA:\n");
 			for( i=0; i<NP; i++) {
 				for (j=0; j<Nc; j++) {
 					printf("%f ",A[i][j]);
 				}
 				printf("--%d\n",taskid);
 			}
 			printf("\nB:\n");
 			for( i=0; i<Nc; i++) {
 				for (j=0; j<NP; j++) {
 					printf("%f ",B_buf_in[i*NP + j]);
 				}
 				printf("--%d\n",taskid);
 			}
 		}
 	
 		for (count = 0; count < P; count++) {
 			//move B to sending buffer
 			memcpy(B_buf_out, B_buf_in, sizeB*dbsize);
 			//send B to next process
 			MPI_Isend(B_buf_out, sizeB, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD, &(send_req[0]));
 			//perform multiplication
 			mult_time_l += matrix_multiply( A, B_buf_in, C, ((count+taskid)%P)*NP, NP );
 			//receive B from previous process
 			start = rdtsc();
 			MPI_Irecv(B_buf_in, sizeB, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &(recv_req[0]));
 			//wait for new B
 			while (recv_count == 0) {
 				MPI_Testsome(0, recv_req, &recv_count, recv_index, MPI_STATUSES_IGNORE);
 			}
 			recv_count = 0;
 			end = rdtsc();
 			send_time_l += ((double)end - (double)start)/clock_rate;
 		
 			if(DEBUG_ON) {
 				printf("\nC(%d):\n",count);
 				for( i=0; i<NP; i++) {
 					for (j=0; j<Nc; j++) {
 						printf("%d ",(int)C[i][j]);
 					}
 					printf("--%d\n",taskid);
 				}
 			}
 		}
 	
 		// Collect stats
 		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 		mult[0] = mult_time[0];
 		send[0] = send_time[0];
 		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
 		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
 		mult[1] = mult_time[0];
 		send[1] = send_time[0];
 		MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 		MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 		mult[1] = mult_time[0];
 		send[1] = send_time[0];
 	}
 	
 	//Finalize
 	MPI_Finalize();
 	
 	//report stats
 	if (taskid == 0) {
 		end_a = rdtsc();
 		total_time = ((double)end_a - (double)start_a)/clock_rate;
 		data = sizeB*P;
 		printf("NUM CORES --- AVG MULT --- AVG SEND --- MIN MULT --- MIN SEND --- MAX MULT --- MAX SEND --- TOTAL EXE\n");
 		printf("%d \t%lf \t%f \t%lf \t%lf \t%lf \t%lf \t%lf\n",P,data/mult[0],data/send[0],data/mult[1],data/send[0],data/mult[1],data/send[0],total_time);
 	}
 		
 }
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <mpi.h>
	#include <string.h>

	#if defined(__i386__)

	static __inline__ unsigned long long rdtsc(void)
	{
	unsigned long long int x;
	__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
	return x;
	}
	#elif defined(__x86_64__)


	/*
	 * Read the time-stamp counter (variant selected when __x86_64__ is defined).
	 * RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX;
	 * both halves are merged into one 64-bit tick count.
	 */
	static __inline__ unsigned long long rdtsc(void)
	{
	    unsigned hi, lo;

	    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
	    /* Plain bitwise OR: the escaped "\|" in the pasted text is not valid C. */
	    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
	}

	#elif defined(__powerpc__)
	/*
	 * PowerPC counterpart of rdtsc(): reads the time base with mftbu/mftb
	 * and returns it as one 64-bit value (upper word in bits 63..32).
	 */
	static __inline__ unsigned long long rdtsc(void)
	{
	    unsigned long long int result=0;
	    unsigned long int upper, lower,tmp;

	    /*
	     * Read upper, lower, then upper again; if the two upper reads
	     * differ the sequence is retried from label 0 so that both halves
	     * belong to the same reading.
	     */
	    __asm__ volatile(
	        "0: \n"
	        "\tmftbu %0 \n"
	        "\tmftb %1 \n"
	        "\tmftbu %2 \n"
	        "\tcmpw %2,%0 \n"
	        "\tbne 0b \n"
	        : "=r"(upper),"=r"(lower),"=r"(tmp)
	    );
	    result = upper;
	    result = result<<32;
	    /* Plain bitwise OR: the escaped "\|" in the pasted text is not valid C. */
	    result = result|lower;

	    return(result);
	}
	#endif

	/***********************************************************************/
	/* START: MT 19937******************************************************/
	/***********************************************************************/

	/*
	 * MT19937 constants and the generator state shared by init_genrand(),
	 * init_by_array() and genrand_int32().
	 */
	/* Period parameters */
	#define N 624
	#define M 397
	#define MATRIX_A 0x9908b0dfUL /* constant vector a */
	#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
	#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

	static unsigned long mt[N]; /* the array for the state vector */
	/* Next index to read from mt[]; genrand_int32() regenerates the array once it reaches N. */
	static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */

	/*
	 * Seed the generator from a single value.
	 *
	 * Fills all N words of mt[] from the low 32 bits of s using the
	 * multiplier 1812433253 (Knuth, TAOCP Vol. 2, 3rd ed., p. 106) and
	 * leaves mti == N so the next genrand_int32() call regenerates the state.
	 */
	void init_genrand(unsigned long s)
	{
	    unsigned long prev = s & 0xffffffffUL;
	    int idx;

	    mt[0] = prev;
	    for (idx = 1; idx < N; idx++) {
	        unsigned long mixed = prev ^ (prev >> 30);

	        /* The mask keeps each word at 32 bits where unsigned long is wider. */
	        prev = (1812433253UL * mixed + idx) & 0xffffffffUL;
	        mt[idx] = prev;
	    }
	    mti = N;
	}

	/*
	 * Seed the generator from an array of keys.
	 *
	 * init_key    array of seed words
	 * key_length  number of entries in init_key
	 *
	 * The state is first seeded with 19650218, then every word is mixed with
	 * the key (cycling through it) and finally scrambled once more.
	 */
	void init_by_array(unsigned long init_key[], int key_length)
	{
	    int pos = 1;        /* word of mt[] being rewritten */
	    int key_pos = 0;    /* current entry of init_key */
	    int remaining;

	    init_genrand(19650218UL);

	    /* Pass 1: fold the key in, max(N, key_length) steps. */
	    remaining = (key_length > N) ? key_length : N;
	    while (remaining > 0) {
	        unsigned long prev = mt[pos - 1];
	        unsigned long word = mt[pos] ^ ((prev ^ (prev >> 30)) * 1664525UL);

	        mt[pos] = (word + init_key[key_pos] + key_pos) & 0xffffffffUL;
	        pos++;
	        key_pos++;
	        if (pos >= N) {
	            mt[0] = mt[N - 1];
	            pos = 1;
	        }
	        if (key_pos >= key_length)
	            key_pos = 0;
	        remaining--;
	    }

	    /* Pass 2: N-1 further non-linear steps without the key. */
	    for (remaining = N - 1; remaining > 0; remaining--) {
	        unsigned long prev = mt[pos - 1];
	        unsigned long word = mt[pos] ^ ((prev ^ (prev >> 30)) * 1566083941UL);

	        mt[pos] = (word - pos) & 0xffffffffUL;
	        pos++;
	        if (pos >= N) {
	            mt[0] = mt[N - 1];
	            pos = 1;
	        }
	    }

	    /* MSB set so the initial array can never be all zero. */
	    mt[0] = 0x80000000UL;
	}

	/*
	 * Return the next 32-bit output, in [0, 0xffffffff].
	 *
	 * When all N state words have been consumed the whole array is
	 * regenerated in one go; if no seeding function was ever called the
	 * default seed 5489 is used first.
	 */
	unsigned long genrand_int32(void)
	{
	    unsigned long y;
	    static unsigned long mag01[2]={0x0UL, MATRIX_A};
	    /* mag01[x] = x * MATRIX_A for x=0,1 */

	    if (mti >= N) { /* generate N words at one time */
	        int kk;

	        if (mti == N+1) /* if init_genrand() has not been called, */
	            init_genrand(5489UL); /* a default initial seed is used */

	        /* The three updates below use a plain bitwise OR; the escaped
	           "\|" in the pasted text is not valid C. */
	        for (kk=0;kk<N-M;kk++) {
	            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
	            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
	        }
	        for (;kk<N-1;kk++) {
	            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
	            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
	        }
	        /* Last word wraps around to mt[0]. */
	        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
	        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];

	        mti = 0;
	    }

	    y = mt[mti++];

	    /* Tempering */
	    y ^= (y >> 11);
	    y ^= (y << 7) & 0x9d2c5680UL;
	    y ^= (y << 15) & 0xefc60000UL;
	    y ^= (y >> 18);

	    return y;
	}

	/* Return a random integer in [0, 0x7fffffff]: one 32-bit draw with the low bit dropped. */
	long genrand_int31(void)
	{
	    unsigned long raw = genrand_int32();

	    return (long)(raw >> 1);
	}

	/* Return a random real in the closed interval [0,1]. */
	double genrand_real1(void)
	{
	    const double scale = 1.0/4294967295.0;   /* 1 / (2^32 - 1) */

	    return genrand_int32() * scale;
	}

	/* Return a random real in the half-open interval [0,1). */
	double genrand_real2(void)
	{
	    const double scale = 1.0/4294967296.0;   /* 1 / 2^32 */

	    return genrand_int32() * scale;
	}

	/* Return a random real in the open interval (0,1). */
	double genrand_real3(void)
	{
	    const double scale = 1.0/4294967296.0;   /* 1 / 2^32 */
	    double centred = ((double)genrand_int32()) + 0.5;   /* shift off zero */

	    return centred * scale;
	}

	/*
	 * Return a random real in [0,1) with 53-bit resolution, built from two
	 * consecutive 32-bit draws (27 bits from the first, 26 from the second).
	 */
	double genrand_res53(void)
	{
	    unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6;

	    /* a * 2^26 + b, scaled by 1 / 2^53.  The multiplication signs were
	       missing from the pasted text, which left the line uncompilable. */
	    return (a*67108864.0+b)*(1.0/9007199254740992.0);
	}
	/* These real versions are due to Isaku Wada, 2002/01/09 added */

	/***********************************************************************/
	/* END: MT 19937 *******************************************************/
	/***********************************************************************/

	/* Standard matrix multiplication */
	/* Arrays start at 0 */

	/* Debug level read from argv[4]: main() prints C after every round when it
	   is non-zero and additionally dumps the sizes, A and B when it exceeds 1. */
	char DEBUG_ON = 0;

	/* Local rows of A: main() fills it with NP row pointers, each to Nc doubles. */
	double **A=NULL;
	/* Local rows of the result C, allocated the same way as A. */
	double **C=NULL;
	/* Block of B currently being multiplied (sizeB = NP*Nc doubles). */
	double *B_buf_in = NULL;
	/* Copy of B_buf_in that main() passes to MPI_Isend. */
	double *B_buf_out = NULL;
	/* Matrix dimension; main() overrides it from argv[3]. */
	unsigned int Nc=8000;
	/* Key handed to init_by_array(); main() replaces element 0 with the MPI rank. */
	unsigned long rng_init_seeds[6]={0x0, 0x123, 0x234, 0x345, 0x456, 0x789};
	unsigned long rng_init_length=6;
	/* Divisors that turn rdtsc() tick differences into seconds. */
	double clock_rateK=2666700000.0; // Kratos
	double clock_rateBGL=700000000.0; // Blue Gene/L
	double clock_rate = 0; // set for BGL or Kratos

	/*
	 * Multiply the local rows of A by one block of B, accumulate the result
	 * into C and report how long that took.
	 *
	 * A        NP row pointers, each to Nc doubles
	 * B        Nc x NP block, element (k, j) stored at B[k*NP + j]
	 * C        NP row pointers; columns B_start .. B_start+NP-1 are updated
	 * B_start  first column of C that receives this block's contribution
	 * NP       number of local rows of A and of columns in the B block
	 *
	 * Reads the globals Nc (inner dimension) and clock_rate (ticks per second).
	 * Returns the elapsed time in seconds.
	 *
	 * The pointer levels of A and B are restored here: the pasted signature
	 * had lost its asterisks and no longer matched the indexing in the body.
	 */
	double matrix_multiply( double **A, double *B, double **C, int B_start, int NP )
	{
	    int i, j;
	    unsigned int k;   /* same type as Nc: no signed/unsigned comparison */
	    unsigned long long start, end;

	    start = rdtsc();
	    for (i = 0; i < NP; i++)
	        for (j = 0; j < NP; j++)
	            for (k = 0; k < Nc; k++) {
	                C[ i ][ j+B_start ] += A[i][k] * B[(size_t)k*NP + j];
	            }
	    end = rdtsc();

	    /* Subtract as integers first: converting each 64-bit counter to
	       double before subtracting can round away the low-order ticks. */
	    return (double)(end - start)/clock_rate;
	}

	void main( int argc, char* argv[])
	{
	int i, j;
	int taskid, numtasks, dest;
	int intsize,dbsize;
	int P,NP,sizeB;
	int count = 0;
	double mult_time_l = 0, send_time_l = 0;
	double mult_time[1024];
	double send_time[1024];

	double mult[3];
	double send[3];
	double total_time = 0;
	double data;

	int Use_nodes = 0; //max nodes to use for a job, use all if 0

	unsigned long long start,start_a, end,end_a;

	MPI_Status status;
	MPI_Request recv_req[16], send_req[16];
	int recv_index[16], recv_count = 0;

	intsize = sizeof(int);
	dbsize = sizeof(double);

	start_a = rdtsc();

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

	clock_rate = clock_rateBGL; // set for BGL

	if(argc > 1) {
	Use_nodes = atoi(argv[1]);
	numtasks = Use_nodes
	}
	if(argc > 2) {
	if (argv[2] == "k" \|\| argv[2] == "K")
	clock_rate = clock_rateK; // set for Kratos
	}
	if(argc > 3) {
	Nc = atoi(argv[3]);
	}
	if(argc > 4) {
	DEBUG_ON = (char)atoi(argv[4]);
	}

	if (Use_nodes > 0 && taskid < Use_nodes) {

	P = numtasks;
	NP = Nc/P;
	sizeB = NP*Nc;
	if(taskid==0 && DEBUG_ON > 1) {
	printf("sizeB:%d\n",sizeB);
	printf("taskid:%d\n",taskid);
	printf("NP:%d\n", NP);
	printf("P:%d\n",P);
	}
	dest = (taskid+1)%numtasks;

	// WHEN USING MPI DO: rng_init_seeds[0] = my_rank;
	rng_init_seeds[0] = taskid;
	init_by_array(rng_init_seeds, rng_init_length);

	//Allocate space
	B_buf_in = (double*)calloc(sizeB, dbsize);
	B_buf_out = (double*)calloc(sizeB, dbsize);

	A = (double *)calloc( NP, sizeof(double));
	for( i = 0; i < NP; i++ )
	A[i] = (double *)calloc( Nc, sizeof(double));

	C = (double *)calloc( NP, sizeof(double));
	for( i = 0; i < NP; i++ )
	C[i] = (double *)calloc( Nc, sizeof(double));

	//Initialize A and B
	for( i = 0; i < NP; i++ ) {
	for( j = 0; j < Nc; j++ ) {
	A[i][j] = genrand_res53();
	B_buf_in[(j*NP)+i] = genrand_res53();
	}
	}
	if(DEBUG_ON > 1) {
	printf("\nA:\n");
	for( i=0; i<NP; i++) {
	for (j=0; j<Nc; j++) {
	printf("%f ",A[i][j]);
	}
	printf("--%d\n",taskid);
	}
	printf("\nB:\n");
	for( i=0; i<Nc; i++) {
	for (j=0; j<NP; j++) {
	printf("%f ",B_buf_in[i*NP + j]);
	}
	printf("--%d\n",taskid);
	}
	}

	for (count = 0; count < P; count++) {
	//move B to sending buffer
	memcpy(B_buf_out, B_buf_in, sizeB*dbsize);
	//send B to next process
	MPI_Isend(B_buf_out, sizeB, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD, &(send_req[0]));
	//perform multiplication
	mult_time_l += matrix_multiply( A, B_buf_in, C, ((count+taskid)%P)*NP, NP );
	//receive B from previous process
	start = rdtsc();
	MPI_Irecv(B_buf_in, sizeB, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &(recv_req[0]));
	//wait for new B
	while (recv_count == 0) {
	MPI_Testsome(0, recv_req, &recv_count, recv_index, MPI_STATUSES_IGNORE);
	}
	recv_count = 0;
	end = rdtsc();
	send_time_l += ((double)end - (double)start)/clock_rate;

	if(DEBUG_ON) {
	printf("\nC(%d):\n",count);
	for( i=0; i<NP; i++) {
	for (j=0; j<Nc; j++) {
	printf("%d ",(int)C[i][j]);
	}
	printf("--%d\n",taskid);
	}
	}
	}

	// Collect stats
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	mult[0] = mult_time[0];
	send[0] = send_time[0];
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
	mult[1] = mult_time[0];
	send[1] = send_time[0];
	MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
	MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
	mult[1] = mult_time[0];
	send[1] = send_time[0];
	}

	//Finalize
	MPI_Finalize();

	//report stats
	if (taskid == 0) {
	end_a = rdtsc();
	total_time = ((double)end_a - (double)start_a)/clock_rate;
	data = sizeB*P;
	printf("NUM CORES --- AVG MULT --- AVG SEND --- MIN MULT --- MIN SEND --- MAX MULT --- MAX SEND --- TOTAL EXE\n");
	printf("%d \t%lf \t%f \t%lf \t%lf \t%lf \t%lf \t%lf\n",P,data/mult[0],data/send[0],data/mult[1],data/send[0],data/mult[1],data/send[0],total_time);
	}

	}