/*
 * Created April 19, 2011 02:48
 * Matrix multiplication code for Blue Gene/L using MPI.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <mpi.h>
/*
 * rdtsc() - read the processor's free-running counter.
 *
 * One implementation is selected per target architecture:
 *   - i386:    the RDTSC instruction (emitted as raw opcode bytes), result
 *              taken from the EDX:EAX pair via the "=A" constraint.
 *   - x86_64:  the RDTSC instruction, halves returned in EAX/EDX and joined.
 *   - PowerPC: the time-base register, read as upper/lower/upper and retried
 *              until both upper reads agree, so a carry between the two
 *              halves cannot produce a torn value.
 *
 * Returns the raw counter value; callers divide a difference of two reads
 * by the counter frequency to obtain elapsed time.
 */
#if defined(__i386__)
static __inline__ unsigned long long rdtsc(void)
{
    unsigned long long int x;

    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}
#elif defined(__x86_64__)
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;

    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#elif defined(__powerpc__)
static __inline__ unsigned long long rdtsc(void)
{
    unsigned long long int result = 0;
    unsigned long int upper, lower, tmp;

    __asm__ volatile(
        "0: \n"
        "\tmftbu %0 \n"
        "\tmftb %1 \n"
        "\tmftbu %2 \n"
        "\tcmpw %2,%0 \n"
        "\tbne 0b \n"
        : "=r"(upper), "=r"(lower), "=r"(tmp)
        : /* no inputs */
        : "cc"              /* cmpw overwrites condition register field 0 */
    );
    result = upper;
    result = result << 32;
    result = result | lower;
    return result;
}
#else
#error "rdtsc(): no cycle-counter implementation for this architecture"
#endif
/* START: MT 19937******************************************************/
/* Period parameters */
/* N is the length of the state array mt[]; M is the offset of the second
 * state word combined with each word when the state is regenerated in
 * genrand_int32(). */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
/* Generator state shared by every function of this section. */
static unsigned long mt[N]; /* the array for the state vector */
/* Index of the next state word to hand out; a value >= N makes
 * genrand_int32() regenerate the whole array first. */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
/* initializes mt[N] with a seed */
void init_genrand(unsigned long s)
mt[0]= s & 0xffffffffUL;
for (mti=1; mti<N; mti++) {
mt[mti] =
(1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
/* In the previous versions, MSBs of the seed affect */
/* only MSBs of the array mt[]. */
/* 2002/01/09 modified by Makoto Matsumoto */
mt[mti] &= 0xffffffffUL;
/* for >32 bit machines */
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
/* slight change for C++, 2004/2/26 */
void init_by_array(unsigned long init_key[], int key_length)
int i, j, k;
i=1; j=0;
k = (N>key_length ? N : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ init_key[j] + j; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++; j++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
if (j>=key_length) j=0;
for (k=N-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
- i; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
if (i>=N) { mt[0] = mt[N-1]; i=1; }
mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
/* generates a random number on [0,0xffffffff]-interval */
unsigned long genrand_int32(void)
unsigned long y;
static unsigned long mag01[2]={0x0UL, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */
if (mti >= N) { /* generate N words at one time */
int kk;
if (mti == N+1) /* if init_genrand() has not been called, */
init_genrand(5489UL); /* a default initial seed is used */
for (kk=0;kk<N-M;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
for (;kk<N-1;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
mti = 0;
y = mt[mti++];
/* Tempering */
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680UL;
y ^= (y << 15) & 0xefc60000UL;
y ^= (y >> 18);
return y;
/* generates a random number on [0,0x7fffffff]-interval */
/* (one genrand_int32() draw with its lowest bit shifted out) */
long genrand_int31(void)
{
    return (long)(genrand_int32() >> 1);
}
/* generates a random number on [0,1]-real-interval */
/* (one genrand_int32() draw scaled so the largest draw maps to 1.0) */
double genrand_real1(void)
{
    return genrand_int32() * (1.0/4294967295.0);
    /* divided by 2^32-1 */
}
/* generates a random number on [0,1)-real-interval */
/* (one genrand_int32() draw scaled so the result stays below 1.0) */
double genrand_real2(void)
{
    return genrand_int32() * (1.0/4294967296.0);
    /* divided by 2^32 */
}
/* generates a random number on (0,1)-real-interval */
/* (the 0.5 offset keeps the result strictly above 0 and below 1) */
double genrand_real3(void)
{
    return (((double)genrand_int32()) + 0.5) * (1.0/4294967296.0);
    /* divided by 2^32 */
}
/* generates a random number on [0,1) with 53-bit resolution*/
/*
 * Two draws are combined: the top 27 bits of the first (a) and the top
 * 26 bits of the second (b) form the 53-bit integer a*2^26 + b, which is
 * then divided by 2^53.
 */
double genrand_res53(void)
{
    unsigned long a = genrand_int32() >> 5, b = genrand_int32() >> 6;

    return (a * 67108864.0 + b) * (1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */
/* END: MT 19937 *******************************************************/
/* Standard matrix multiplication */
/* Arrays start at 0 */
/* Verbosity flag, set from argv[4] in main(): any non-zero value prints C,
 * values above 1 additionally print NP, A and B. */
char DEBUG_ON = 0;
/* Row-pointer matrices allocated in main() as NP rows of Nc doubles each. */
double **A=NULL;
double **C=NULL;
/* Flat buffers of NP*Nc doubles holding the current block of B (in) and the
 * copy handed to MPI for sending (out); element (k, j) is at k*NP + j. */
double *B_buf_in = NULL;
double *B_buf_out = NULL;
/* Matrix dimension; main() overrides it from argv[3]. */
unsigned int Nc=8000;
/* Key array for init_by_array(); main() replaces entry 0 with the MPI rank. */
unsigned long rng_init_seeds[6]={0x0, 0x123, 0x234, 0x345, 0x456, 0x789};
unsigned long rng_init_length=6;
/* Counter frequencies by which rdtsc() differences are divided to obtain
 * times (presumably ticks per second - confirm for the target machine). */
double clock_rateK=2666700000.0; // Kratos
double clock_rateBGL=700000000.0; // Blue Gene/L
double clock_rate = 0; // set for BGL or Kratos
double matrix_multiply( double **A, double *B, double **C, int B_start, int NP )
int i=0, j=0, k=0;
unsigned long long start=rdtsc();
unsigned long long end=rdtsc();
for (i = 0; i< NP; i++)
for( j = 0; j < NP; j++ )
for( k = 0; k < Nc; k++ ) {
C[ i ][ j+B_start ] += A[i][k] * B[k*NP + j];
//C[ i+A_start ][ j+B_start ] += A[i+A_start][k] * B[k*NP + j]; //A_start only needed when allocating all of A and C for each process
end = rdtsc();
return ((double)end - (double)start)/clock_rate;
void main( int argc, char* argv[])
int i, j;
int taskid, numtasks, dest;
int intsize,dbsize;
int P,NP,sizeB;
int count = 0;
double mult_time_l = 0, send_time_l = 0;
double mult_time[1024];
double send_time[1024];
double mult[3];
double send[3];
double total_time = 0;
double data;
int Use_nodes = 0; //max nodes to use for a job, use all if 0
unsigned long long start,start_a, end,end_a;
MPI_Status status;
MPI_Request recv_req[16], send_req[16];
int recv_index[16], recv_count = 0;
intsize = sizeof(int);
dbsize = sizeof(double);
start_a = rdtsc();
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
clock_rate = clock_rateBGL; // set for BGL
if(argc > 1) {
Use_nodes = atoi(argv[1]);
numtasks = Use_nodes
if(argc > 2) {
if (argv[2] == "k" || argv[2] == "K")
clock_rate = clock_rateK; // set for Kratos
if(argc > 3) {
Nc = atoi(argv[3]);
if(argc > 4) {
DEBUG_ON = (char)atoi(argv[4]);
if (Use_nodes > 0 && taskid < Use_nodes) {
P = numtasks;
NP = Nc/P;
sizeB = NP*Nc;
if(taskid==0 && DEBUG_ON > 1) {
printf("NP:%d\n", NP);
dest = (taskid+1)%numtasks;
// WHEN USING MPI DO: rng_init_seeds[0] = my_rank;
rng_init_seeds[0] = taskid;
init_by_array(rng_init_seeds, rng_init_length);
//Allocate space
B_buf_in = (double*)calloc(sizeB, dbsize);
B_buf_out = (double*)calloc(sizeB, dbsize);
A = (double **)calloc( NP, sizeof(double*));
for( i = 0; i < NP; i++ )
A[i] = (double *)calloc( Nc, sizeof(double));
C = (double **)calloc( NP, sizeof(double*));
for( i = 0; i < NP; i++ )
C[i] = (double *)calloc( Nc, sizeof(double));
//Initialize A and B
for( i = 0; i < NP; i++ ) {
for( j = 0; j < Nc; j++ ) {
A[i][j] = genrand_res53();
B_buf_in[(j*NP)+i] = genrand_res53();
if(DEBUG_ON > 1) {
for( i=0; i<NP; i++) {
for (j=0; j<Nc; j++) {
printf("%f ",A[i][j]);
for( i=0; i<Nc; i++) {
for (j=0; j<NP; j++) {
printf("%f ",B_buf_in[i*NP + j]);
for (count = 0; count < P; count++) {
//move B to sending buffer
memcpy(B_buf_out, B_buf_in, sizeB*dbsize);
//send B to next process
MPI_Isend(B_buf_out, sizeB, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD, &(send_req[0]));
//perform multiplication
mult_time_l += matrix_multiply( A, B_buf_in, C, ((count+taskid)%P)*NP, NP );
//receive B from previous process
start = rdtsc();
MPI_Irecv(B_buf_in, sizeB, MPI_CHAR, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &(recv_req[0]));
//wait for new B
while (recv_count == 0) {
MPI_Testsome(0, recv_req, &recv_count, recv_index, MPI_STATUSES_IGNORE);
recv_count = 0;
end = rdtsc();
send_time_l += ((double)end - (double)start)/clock_rate;
if(DEBUG_ON) {
for( i=0; i<NP; i++) {
for (j=0; j<Nc; j++) {
printf("%d ",(int)C[i][j]);
// Collect stats
MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mult[0] = mult_time[0];
send[0] = send_time[0];
MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
mult[1] = mult_time[0];
send[1] = send_time[0];
MPI_Allreduce(&send_time_l, send_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&mult_time_l, mult_time, numtasks, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
mult[1] = mult_time[0];
send[1] = send_time[0];
//report stats
if (taskid == 0) {
end_a = rdtsc();
total_time = ((double)end_a - (double)start_a)/clock_rate;
data = sizeB*P;
printf("%d \t%lf \t%f \t%lf \t%lf \t%lf \t%lf \t%lf\n",P,data/mult[0],data/send[0],data/mult[1],data/send[0],data/mult[1],data/send[0],total_time);
