jstroem · December 6, 2012 04:18
diff --git a/batch_bang.sh b/batch_bang.sh
 #!/bin/bash
 # This script is intepreted by the Bourne Shell, sh
 #
 # Documentation for SGE is found in:
 # http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
 #
 # Tell SGE which shell to run the job script in rather than depending
 # on SGE to try and figure it out.
 #$ -S /bin/bash
 #
 # Export all my environment variables to the job
 #$ -V
 #
 # Tun the job in the same directory from which you submitted it
 #$ -cwd
 #
 #
 # --- Don't change anything above this line ---
 #
 # Give a name to the job
 #$ -N MPI
 #
 # Specify a time limit for the job, not more than 30 minutes
 #$ -l h_rt=00:10:00
 #
 # Specify the parallel environment and number of cores,
 # If not a multiple of 8, you'll get the whole node anyway
 #$ -pe orte 8
 #
 # Join stdout and stderr so they are reported in job output file
 #$ -j y
 #
 #
 # Choose the queue to run the job
 #
 # Debug queue: only one node may be used at a time for up to 30 minutes
 # Interactive or batch jobs, maximum of 1 job per user running at a time
 #
 # Normal queue: job may use all available compute nodes (256 cores)
 # for up to 60 minutes
 # Batch jobs, maximum of 2 jobs per user running at a time
 # To use more than one node, specify the "normal" queue
 #$ -q normal.q
 # #$ -q debug.q
 #
 # Specifies the circumstances under which mail is to be sent to the job owner
 # defined by -M option. For example, options "bea" cause mail to be sent at the 
 # begining, end, and at abort time (if it happens) of the job.
 # Option "n" means no mail will be sent.
 #$ -m aeb
 #
 # *** Change to the address you want the notification sent to, and
 # *** REMOVE the blank between the # and the $
 #$ -M [email protected]
 #


 echo
 echo " *** Current working directory"
 pwd
 echo
 echo " *** Compiler"
 # Output which  compiler are we using and the environment
 mpicc -v
 echo
 echo " *** Environment"
 printenv

 echo

 echo ">>> Job Starts"
 date
 mpirun -np $NSLOTS ./mpi -n 5000 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt

 date
 echo ">>> Job Ends"
diff --git a/common.cpp b/common.cpp
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 #include <float.h>
 #include <string.h>
 #include <math.h>
 #include <time.h>
 #include <mpi.h>
 #include <sys/time.h>
 #include "common.h"

 double size;

 //
 //  tuned constants
 //
 #define density 0.0010
 #define mass    0.01
 #define cutoff  0.01
 #define overlap 0.011
 #define min_r   (cutoff/100)
 #define dt      0.0005

 void setup_ghost_buffers( neighbor_t *neighbors, double size_x, double size_y){
    neighbor_t self = neighbors[SELF];
    for(int i = 1; i < 9; i++){
        if (neighbors[i].proc_x - self.proc_x != 0 && neighbors[i].proc_y - self.proc_y != 0) {
            neighbors[i].buffer_size = (int)(min(ceil(size_y),ceil(size_x)) / mass);
        } else if (neighbors[i].proc_x - self.proc_x != 0) {
            neighbors[i].buffer_size = (int)ceil(size_y / mass);
        } else if (neighbors[i].proc_y - self.proc_y != 0) {
            neighbors[i].buffer_size = (int)ceil(size_x / mass);
        }
        neighbors[i].send_buffer = (particle_t*) malloc( sizeof(particle_t) * neighbors[i].buffer_size);
    }
 }

 void copy_to_ghost_buffers( particle_t *particles, neighbor_t *neighbors) {
    neighbor_t self = neighbors[SELF];
    for(int i = 1; i < 9; i++){
        neighbors[i].send_size = 0;
    }
    for (int i = self.offset; i < self.offset + self.recv_size; i++){
        if (particles[i].x <= self.from_x + overlap && particles[i].y <= self.from_y + overlap && neighbors[TOP_LEFT].rank != -1) {//TOP LEFT particle
            if (neighbors[TOP_LEFT].buffer_size <= neighbors[TOP_LEFT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_LEFT].rank, neighbors[TOP_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[TOP_LEFT].send_buffer[neighbors[TOP_LEFT].send_size] = particles[i];
            }
            neighbors[TOP_LEFT].send_size = neighbors[TOP_LEFT].send_size + 1;
        }
        if (self.to_x - overlap < particles[i].x && particles[i].y <= self.from_y + overlap && neighbors[TOP_RIGHT].rank != -1) {//TOP RIGHT particle
            if (neighbors[TOP_RIGHT].buffer_size <= neighbors[TOP_RIGHT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_RIGHT].rank, neighbors[TOP_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[TOP_RIGHT].send_buffer[neighbors[TOP_RIGHT].send_size] = particles[i];
            }
            neighbors[TOP_RIGHT].send_size = neighbors[TOP_RIGHT].send_size + 1;
        }
        if (particles[i].x <= self.from_x + overlap && self.to_y - overlap < particles[i].y && neighbors[BOTTOM_LEFT].rank != -1) {// BOTTOM LEFT particle
            if (neighbors[BOTTOM_LEFT].buffer_size <= neighbors[BOTTOM_LEFT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_LEFT].rank, neighbors[BOTTOM_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[BOTTOM_LEFT].send_buffer[neighbors[BOTTOM_LEFT].send_size] = particles[i];
            }
            neighbors[BOTTOM_LEFT].send_size = neighbors[BOTTOM_LEFT].send_size + 1;
        }
        if (self.to_x - overlap < particles[i].x && self.to_y - overlap < particles[i].y && neighbors[BOTTOM_RIGHT].rank != -1) {//BOTTOM RIGHT particle
            if (neighbors[BOTTOM_RIGHT].buffer_size <= neighbors[BOTTOM_RIGHT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_RIGHT].rank, neighbors[BOTTOM_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[BOTTOM_RIGHT].send_buffer[neighbors[BOTTOM_RIGHT].send_size] = particles[i];
            }
            neighbors[BOTTOM_RIGHT].send_size = neighbors[BOTTOM_RIGHT].send_size + 1;
        }
        if (self.to_x - overlap < particles[i].x && neighbors[STRAIGHT_RIGHT].rank != -1) { // STRAIGHT RIGHT particle
            if (neighbors[STRAIGHT_RIGHT].buffer_size <= neighbors[STRAIGHT_RIGHT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[STRAIGHT_RIGHT].rank, neighbors[STRAIGHT_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[STRAIGHT_RIGHT].send_buffer[neighbors[STRAIGHT_RIGHT].send_size] = particles[i];
            }
            neighbors[STRAIGHT_RIGHT].send_size = neighbors[STRAIGHT_RIGHT].send_size + 1;
        }
        if (particles[i].x <= self.from_x + overlap && neighbors[STRAIGHT_LEFT].rank != -1) { // STRAIGHT LEFT particle
            if (neighbors[STRAIGHT_LEFT].buffer_size <= neighbors[STRAIGHT_LEFT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[STRAIGHT_LEFT].rank, neighbors[STRAIGHT_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[STRAIGHT_LEFT].send_buffer[neighbors[STRAIGHT_LEFT].send_size] = particles[i];
            }
            neighbors[STRAIGHT_LEFT].send_size = neighbors[STRAIGHT_LEFT].send_size + 1;
        }
        if (self.to_y - overlap < particles[i].y && neighbors[BOTTOM_STRAIGHT].rank != -1) { // BOTTOM STRAIGHT particle
            if (neighbors[BOTTOM_STRAIGHT].buffer_size <= neighbors[BOTTOM_STRAIGHT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_STRAIGHT].rank, neighbors[BOTTOM_STRAIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[BOTTOM_STRAIGHT].send_buffer[neighbors[BOTTOM_STRAIGHT].send_size] = particles[i];
            }
            neighbors[BOTTOM_STRAIGHT].send_size = neighbors[BOTTOM_STRAIGHT].send_size + 1;
        }
        if (particles[i].y <= self.from_y + overlap && neighbors[TOP_STRAIGHT].rank != -1) { // TOP STRAIGHT particle
            if (neighbors[TOP_STRAIGHT].buffer_size <= neighbors[TOP_STRAIGHT].send_size){
                printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_STRAIGHT].rank, neighbors[TOP_STRAIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
            } else {
                neighbors[TOP_STRAIGHT].send_buffer[neighbors[TOP_STRAIGHT].send_size] = particles[i];
            }
            neighbors[TOP_STRAIGHT].send_size = neighbors[TOP_STRAIGHT].send_size + 1;
        }
    }
 }

 //
 //  timer
 //
 double read_timer( )
 {
    static bool initialized = false;
    static struct timeval start;
    struct timeval end;
    if( !initialized )
    {
        gettimeofday( &start, NULL );
        initialized = true;
    }
    gettimeofday( &end, NULL );
    return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
 }

 //
 //  keep density constant
 //
 double set_size( int n )
 {
    size = sqrt( density * n );
    return size;
 }

 //
 //  Initialize the particle positions and velocities
 //
 void init_particles( int n, particle_t *p )
 {
    srand48( 1 );
        
    int sx = (int)ceil(sqrt((double)n));
    int sy = (n+sx-1)/sx;
    
    int *shuffle = (int*)malloc( n * sizeof(int) );
    for( int i = 0; i < n; i++ )
        shuffle[i] = i;
    
    for( int i = 0; i < n; i++ ) 
    {
        //
        //  make sure particles are not spatially sorted
        //
        int j = lrand48()%(n-i);
        int k = shuffle[j];
        shuffle[j] = shuffle[n-i-1];
        
        //
        //  distribute particles evenly to ensure proper spacing
        //
        p[i].x = size*(1.+(k%sx))/(1+sx);
        p[i].y = (size/2)*(1.+(k/sx))/(1+sy);
        p[i].index = i;

        //
        //  assign random velocities within a bound
        //
        p[i].vx = drand48()*2-1;
        p[i].vy = drand48()*2-1;
    }
    free( shuffle );
 }

 //
 //  interact two particles
 //
 void apply_force( particle_t &particle, particle_t &neighbor )
 {

    double dx = neighbor.x - particle.x;
    double dy = neighbor.y - particle.y;
    double r2 = dx * dx + dy * dy;
    if( r2 > cutoff*cutoff )
        return;
    r2 = fmax( r2, min_r*min_r );
    double r = sqrt( r2 );

    //
    //  very simple short-range repulsive force
    //
    double coef = ( 1 - cutoff / r ) / r2 / mass;
    particle.ax += coef * dx;
    particle.ay += coef * dy;
 }

 //
 //  integrate the ODE
 //
 void move( particle_t &p )
 {
    //
    //  slightly simplified Velocity Verlet integration
    //  conserves energy better than explicit Euler method
    //
    p.vx += p.ax * dt;
    p.vy += p.ay * dt;
    p.x  += p.vx * dt;
    p.y  += p.vy * dt;

    //
    //  bounce from walls
    //
    while( p.x < 0 || p.x > size )
    {
        p.x  = p.x < 0 ? -p.x : 2*size-p.x;
        p.vx = -p.vx;
    }
    while( p.y < 0 || p.y > size )
    {
        p.y  = p.y < 0 ? -p.y : 2*size-p.y;
        p.vy = -p.vy;
    }
 }

 //
 //  I/O routines
 //
 void save( FILE *f, int n, particle_t *p )
 {
    static bool first = true;
    if( first )
    {
        fprintf( f, "%d %g\n", n, size );
        first = false;
    }
    for( int i = 0; i < n; i++ )
        fprintf( f, "%g %g\n", p[i].x, p[i].y );
 }

 //
 //  command line option processing
 //
 int find_option( int argc, char **argv, const char *option )
 {
    for( int i = 1; i < argc; i++ )
        if( strcmp( argv[i], option ) == 0 )
            return i;
    return -1;
 }

 int read_int( int argc, char **argv, const char *option, int default_value )
 {
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return atoi( argv[iplace+1] );
    return default_value;
 }

 char *read_string( int argc, char **argv, const char *option, char *default_value )
 {
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return argv[iplace+1];
    return default_value;
 }
diff --git a/common.h b/common.h
 #ifndef __CS267_COMMON_H__
 #define __CS267_COMMON_H__

 inline int min( int a, int b ) { return a < b ? a : b; }
 inline int max( int a, int b ) { return a > b ? a : b; }

 //
 //  saving parameters
 //
 const int NSTEPS = 1000;
 const int SAVEFREQ = 10;

 const int   SELF = 0, 
            BOTTOM_STRAIGHT = 1, 
            BOTTOM_LEFT = 2, 
            STRAIGHT_LEFT = 3, 
            TOP_LEFT = 4, 
            TOP_STRAIGHT = 5, 
            TOP_RIGHT = 6, 
            STRAIGHT_RIGHT = 7, 
            BOTTOM_RIGHT = 8;

 //
 // particle data structure
 //
 typedef struct 
 {
  double index;
  double x;
  double y;
  double vx;
  double vy;
  double ax;
  double ay;
 } particle_t;

 typedef struct
 {
 	int offset;
 	int send_size;
 	int recv_size;
 	int proc_x;
 	int proc_y;
 	int rank;
 	double from_x;
 	double from_y;
 	double to_x;
 	double to_y;
 	particle_t *send_buffer;
 	int buffer_size;
 	MPI_Request recv_req;
 	MPI_Request send_req;
 	MPI_Status recv_stat;
 	MPI_Status send_stat;
 } neighbor_t;

 //
 //  timing routines
 //
 double read_timer( );


 void copy_to_ghost_buffers( particle_t *particles, neighbor_t *neighbors);
 void setup_ghost_buffers( neighbor_t *neighbors, double size_x, double size_y);

 //
 //  simulation routines
 //
 double set_size( int n );
 void init_particles( int n, particle_t *p );
 void apply_force( particle_t &particle, particle_t &neighbor );
 void move( particle_t &p );

 //
 //  I/O routines
 //
 FILE *open_save( char *filename, int n );
 void save( FILE *f, int n, particle_t *p );

 //
 //  argument processing routines
 //
 int find_option( int argc, char **argv, const char *option );
 int read_int( int argc, char **argv, const char *option, int default_value );
 char *read_string( int argc, char **argv, const char *option, char *default_value );

 #endif
diff --git a/Makefile b/Makefile
 HOST = $(shell hostname)
 BANG   =  $(shell expr match `hostname` ccom-bang)
 BANG-COMPUTE   =  $(shell expr match `hostname` compute)
 LILLIPUT   =  $(shell expr match `hostname` lilliput)



 ifneq ($(BANG), 0)
 PUB     = /share/class/public/cse260-fa12
 include $(PUB)/Arch/arch.gnu.generic
 else
 ifneq ($(BANG-COMPUTE), 0)
 PUB     = /share/class/public/cse260-fa12
 include $(PUB)/Arch/arch.gnu.generic
 else
 ifneq ($(LILLIPUT), 0)
 PUB	= /class/public/cse260-fa12
 include $(PUB)/Arch/arch.intel.generic
 else
 # PUB = /Users/baden/lib
 include $(PUB)/Arch/arch.gnu
 # include $(PUB)/Arch/arch.gnu-4.5
 endif
 endif
 endif
 #
 # Add symbol table information for gdb/cachegrind
 ifeq ($(debug), 1)
        CFLAGS += -g
        LDFLAGS += -g
        C++FLAGS += -g
 endif   


 # Add symbol table information for gprof
 ifeq ($(gprof), 1)
        CFLAGS += -g -pg
        C++FLAGS += -g -pg
        LDFLAGS += -g -pg
 endif

 # If you want to compile for single precision,
 # specify single=1 on the "make" command line
 ifeq ($(single), 1)
 else
    C++FLAGS += -D_DOUBLE
    CFLAGS += -D_DOUBLE
 endif


 # If you want to compile so that you call the plotter for
 # flattened 2D arrays (implemented as 1D arrays)
 # specify flattened=1 on the "make" command line
 ifeq ($(flattened), 1)
    C++FLAGS += -DPLOT1D
    CFLAGS += -DPLOT1D
 endif

 # If you want to use restrict pointers, make restrict=1
 # This applies to the hand code version
 ifeq ($(restrict), 1)
    C++FLAGS += -D__RESTRICT
    CFLAGS += -D__RESTRICT
 ifneq ($(CARVER), 0)
    C++FLAGS += -restrict
    CFLAGS += -restrict
 endif
 endif


 #DEBUG += -DDEBUG
 TARGETS = mpi

 app:		$(TARGETS)

 OBJECTS = common.o mpi.o.o
 #ifeq ($(no-mpi),1)
 #OBJECTS += Timer.o
 #endif
 app:	$(TARGETS)

 mpi: mpi.o common.o
 	$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)

 clean:	
 	$(RM) *.o $(TARGETS);
 	$(RM) core.*;
diff --git a/mpi.cpp b/mpi.cpp
 #include <mpi.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 #include <math.h>
 #include "common.h"

 //
 //  benchmarking program
 //
 int main( int argc, char **argv )
 {    
    //
    //  process command line parameters
    //
    if( find_option( argc, argv, "-h" ) >= 0 )
    {
        printf( "Options:\n" );
        printf( "-h to see this help\n" );
        printf( "-n <int> to set the number of particles\n" );
        printf( "-proc_width <int> to set the width of the processor grid\n" );
        printf( "-proc_height <int> to set the height of the processor grid\n" );
        printf( "-o <filename> to specify the output file name\n" );
        printf( "-c <checksum> to specify the output checksum file name\n" );
        return 0;
    }
    
    int n = read_int( argc, argv, "-n", 1000 );
    int proc_width = read_int( argc, argv, "-proc_width", NULL );
    int proc_height = read_int( argc, argv, "-proc_height", NULL );
    char *savename = read_string( argc, argv, "-o", NULL );
    char *checksum = read_string( argc, argv, "-c", NULL );

    //
    //  set up MPI
    //
    int n_proc, myrank;
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    //Ensure that px and py are correct setup.
     if ((proc_height * proc_width) != n_proc && !myrank){
        printf("\n *** The number of  processors in the geometry (%d)  is not the same as the number requested (%d)",proc_height*proc_width,n_proc);
        exit(-1);
     }

     int my_x = myrank % proc_width;
     int my_y = myrank / proc_width;

    //
    // Setup the neighbor setup
    //
    neighbor_t *neighbors = (neighbor_t*) malloc( 9 * sizeof(neighbor_t));
    neighbors[ SELF ].proc_x = my_x; neighbors[ SELF ].proc_y = my_y;
    neighbors[ BOTTOM_STRAIGHT ].proc_x = my_x; neighbors[ BOTTOM_STRAIGHT ].proc_y = my_y + 1;
    neighbors[ BOTTOM_LEFT ].proc_x = my_x - 1; neighbors[ BOTTOM_LEFT ].proc_y = my_y + 1;
    neighbors[ STRAIGHT_LEFT ].proc_x = my_x - 1; neighbors[ STRAIGHT_LEFT ].proc_y = my_y;
    neighbors[ TOP_LEFT ].proc_x = my_x - 1; neighbors[ TOP_LEFT ].proc_y = my_y - 1;
    neighbors[ TOP_STRAIGHT ].proc_x = my_x; neighbors[ TOP_STRAIGHT ].proc_y = my_y - 1;
    neighbors[ TOP_RIGHT ].proc_x = my_x + 1; neighbors[ TOP_RIGHT ].proc_y = my_y - 1;
    neighbors[ STRAIGHT_RIGHT ].proc_x = my_x + 1; neighbors[ STRAIGHT_RIGHT ].proc_y = my_y;
    neighbors[ BOTTOM_RIGHT ].proc_x = my_x + 1; neighbors[ BOTTOM_RIGHT ].proc_y = my_y + 1;

    //
    //  allocate generic resources
    //
    FILE *fsave = savename && myrank == 0 ? fopen( savename, "w" ) : NULL;
    FILE *fchecksum = checksum && myrank == 0 ? fopen( checksum, "w" ) : NULL;
    particle_t *particles = (particle_t*) malloc( n * sizeof(particle_t));
    particle_t *particles_prev = (particle_t*) malloc( n * sizeof(particle_t));
    
    MPI_Datatype PARTICLE;
    MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE );
    MPI_Type_commit( &PARTICLE );
    
    //
    //  initialize and distribute the particles (that's fine to leave it unoptimized)
    //
    double size = set_size( n );
    double size_x = size / proc_width;
    double size_y = size / proc_height;

    //
    //  set up the data partitioning across processors
    //
    setup_ghost_buffers(neighbors, size_x, size_y);
    if (!myrank) { //Init all particles at rank 0
        init_particles( n, particles_prev );
    }
    MPI_Bcast(particles_prev, n, PARTICLE, 0, MPI_COMM_WORLD);

    for( int i = 0; i < 9; i++ ) {
        neighbors[ i ].from_x = size_x * neighbors[ i ].proc_x;
        neighbors[ i ].to_x = size_x + neighbors[ i ].from_x;
        neighbors[ i ].from_y = size_y * neighbors[ i ].proc_y;
        neighbors[ i ].to_y = size_y + neighbors[ i ].from_y;
        if (i == 0) { // 0 = SELF
            neighbors[ SELF ].rank =  neighbors[ SELF ].proc_x + neighbors[ SELF ].proc_y * proc_width;
            neighbors[ SELF ].offset = 0;
            neighbors[ SELF ].send_size = 0;
            neighbors[ SELF ].recv_size = 0;
            for(int j = 0; j < n; j++){
                if (particles_prev[j].x >= neighbors[SELF].from_x && particles_prev[j].x < neighbors[SELF].to_x && 
                    particles_prev[j].y >= neighbors[SELF].from_y && particles_prev[j].y < neighbors[SELF].to_y){
                        particles[neighbors[ SELF ].recv_size] = particles_prev[j];
                        neighbors[ SELF ].recv_size += 1;
                }
            }
            neighbors[ SELF ].buffer_size = neighbors[SELF].recv_size;
        } else if (neighbors[ i ].proc_x > -1 && neighbors[i].proc_x < proc_width && neighbors[ i ].proc_y > -1 && neighbors[i].proc_y < proc_height) {
            neighbors[ i ].rank =  neighbors[ i ].proc_x + neighbors[ i ].proc_y * proc_width;
            neighbors[ i ].offset = neighbors[ i - 1 ].buffer_size + neighbors[ i - 1 ].offset;
        } else {
            neighbors[ i ].buffer_size = 0;
            neighbors[ i ].rank =  -1;
            neighbors[ i ].offset = neighbors[ i - 1 ].buffer_size + neighbors[ i - 1 ].offset;
            neighbors[ i ].send_size = 0;
            neighbors[ i ].recv_size = 0;
        }
    }

    for(int i = 1; i < 9; i++){
        if (neighbors[i].rank != -1){
            MPI_Irecv(&particles[neighbors[i].offset],neighbors[i].buffer_size, PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].recv_req );
        }
    }

    //Send info out to each neighbor
    copy_to_ghost_buffers( particles, neighbors );
    for(int i = 1; i < 9; i++) {
        if (neighbors[i].rank != -1) {          
            MPI_Isend(&neighbors[i].send_buffer[0],neighbors[i].send_size, PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].send_req );
        }
    }

    //Ensure that we have recv from everyone
    for(int i = 1; i < 9; i++) {
        if (neighbors[i].rank != -1) {
            MPI_Wait(&neighbors[i].recv_req,&neighbors[i].recv_stat);
            MPI_Get_count(&neighbors[i].recv_stat, PARTICLE, &neighbors[i].recv_size);
        }
    }

    //Ensure that you have send to anyone.
    for(int i = 1; i < 9; i++) {
        if (neighbors[i].rank != -1) {
            MPI_Wait(&neighbors[i].send_req,&neighbors[i].send_stat);
        }
    }
    
    particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;

    //Myrank is main so he is the one gathering the info
    int *n_proc_sizes = new int[n_proc];
    int *n_proc_offset = new int[n_proc];
    int count;

    MPI_Gather( &neighbors[SELF].recv_size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
    if(!myrank ) {
        int real_n = 0;
        for(int i = 0; i < n_proc; i++){
            real_n += n_proc_sizes[i];
        }
        printf("Before correctness check: n: %d, real-n: %d\n",n, real_n);
    }

    //
    //  simulate a number of time steps
    //
    double simulation_time = read_timer( );
    for( int step = 0; step < NSTEPS; step++ )
    {   
        //
        //  save current step if necessary (slightly different semantics than in other codes)
        //
        if( (step%SAVEFREQ) == 0 && savename ) {
            //To to collect from every node 
            MPI_Gather( &neighbors[SELF].recv_size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
            //Calculate offsets
            if (myrank == 0){
                for(int i = 0; i < n_proc; i++){
                    if (i == 0) {
                        n_proc_offset[i] = 0;
                    } else {
                        n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
                    }
                }
            }
            MPI_Gatherv( &particles_prev[neighbors[SELF].offset], neighbors[SELF].recv_size, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
            if (myrank == 0 && fsave) {
                save( fsave, n, particles );
            }
        }

        //
        //  compute forces in us and the neighbors
        //
        for( int i = 0; i < 9; i++){
            for (int k = neighbors[i].offset; k < neighbors[i].offset + neighbors[i].recv_size; k++){
                particles_prev[k].ax = particles_prev[k].ay = 0;
                for( int j = 0; j < 9; j++){
                    for (int l = neighbors[j].offset; l < neighbors[j].offset + neighbors[j].recv_size; l++){
                        apply_force( particles_prev[k], particles_prev[l] );
                    }
                }  
            }
        }
        
        //
        //  move particles
        //
        count = 0;
        for(int i = 0; i < 9; i++){
            for (int k = neighbors[i].offset; k < neighbors[i].offset + neighbors[i].recv_size; k++){
                move(particles_prev[k]);
                if (particles_prev[k].x >= neighbors[SELF].from_x && particles_prev[k].x < neighbors[SELF].to_x && 
                    particles_prev[k].y >= neighbors[SELF].from_y && particles_prev[k].y < neighbors[SELF].to_y){
                        particles[count + neighbors[SELF].offset] = particles_prev[k];
                        count = count + 1;
                }
            }
        }
        neighbors[SELF].buffer_size = neighbors[SELF].recv_size = count;

        for(int i = 1; i < 9; i++) {
            neighbors[ i ].offset = neighbors[ i - 1 ].buffer_size + neighbors[ i - 1 ].offset;
            if (neighbors[i].rank != -1){
                MPI_Irecv(&particles[neighbors[i].offset],neighbors[i].buffer_size, PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].recv_req );
            }
        }

        copy_to_ghost_buffers(particles, neighbors);

        for( int i = 1; i < 9; i++){
            if (neighbors[i].rank != -1){
                 MPI_Isend(&neighbors[i].send_buffer[0],min(neighbors[i].send_size, neighbors[i].buffer_size), PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].send_req );
            }
        }


        for( int i = 1; i < 9; i++) {
            if (neighbors[i].rank != -1){
                MPI_Wait(&neighbors[i].recv_req,&neighbors[i].recv_stat);
                MPI_Get_count(&neighbors[i].recv_stat, PARTICLE, &neighbors[i].recv_size);
            }
        }

        particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;


        for( int i = 1; i < 9; i++) {
            if (neighbors[i].rank != -1){
                MPI_Wait(&neighbors[i].send_req,&neighbors[i].send_stat);
            }
        }
    }
    simulation_time = read_timer( ) - simulation_time;

    MPI_Gather( &neighbors[SELF].recv_size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
    if(!myrank ) {
        int real_n = 0;
        for(int i = 0; i < n_proc; i++){
            if (i == 0) {
                n_proc_offset[i] = 0;
            } else {
                n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
            }
            real_n += n_proc_sizes[i];
        }
        printf("After correctness check: n: %d, real-n: %d\n",n, real_n);
            
        printf( "n = %d, n_procs = %d, proc_width = %d, proc_height = %d, simulation time = %g s\n", n, n_proc, proc_width, proc_height, simulation_time );
    }
    MPI_Gatherv( &particles_prev[neighbors[SELF].offset], neighbors[SELF].recv_size, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
    //Sort every particle to their i value
    if (!myrank && fchecksum){
        for(int j = 0; j < n; j++){
            particles_prev[(int)particles[j].index] = particles[j];
        }
        save( fchecksum, n, particles_prev );
    }
    
    //
    //  release resources
    //
    free( neighbors );
    free( particles );
    free( particles_prev );
    if( fsave )
        fclose( fsave );
    if (fchecksum)
        fclose( fchecksum );
    
    MPI_Finalize( );
    
    return 0;
 }
	#!/bin/bash
	# This script is intepreted by the Bourne Shell, sh
	#
	# Documentation for SGE is found in:
	# http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
	#
	# Tell SGE which shell to run the job script in rather than depending
	# on SGE to try and figure it out.
	#$ -S /bin/bash
	#
	# Export all my environment variables to the job
	#$ -V
	#
	# Tun the job in the same directory from which you submitted it
	#$ -cwd
	#
	#
	# --- Don't change anything above this line ---
	#
	# Give a name to the job
	#$ -N MPI
	#
	# Specify a time limit for the job, not more than 30 minutes
	#$ -l h_rt=00:10:00
	#
	# Specify the parallel environment and number of cores,
	# If not a multiple of 8, you'll get the whole node anyway
	#$ -pe orte 8
	#
	# Join stdout and stderr so they are reported in job output file
	#$ -j y
	#
	#
	# Choose the queue to run the job
	#
	# Debug queue: only one node may be used at a time for up to 30 minutes
	# Interactive or batch jobs, maximum of 1 job per user running at a time
	#
	# Normal queue: job may use all available compute nodes (256 cores)
	# for up to 60 minutes
	# Batch jobs, maximum of 2 jobs per user running at a time
	# To use more than one node, specify the "normal" queue
	#$ -q normal.q
	# #$ -q debug.q
	#
	# Specifies the circumstances under which mail is to be sent to the job owner
	# defined by -M option. For example, options "bea" cause mail to be sent at the
	# begining, end, and at abort time (if it happens) of the job.
	# Option "n" means no mail will be sent.
	#$ -m aeb
	#
	# *** Change to the address you want the notification sent to, and
	# *** REMOVE the blank between the # and the $
	#$ -M [email protected]
	#


	echo
	echo " *** Current working directory"
	pwd
	echo
	echo " *** Compiler"
	# Output which compiler are we using and the environment
	mpicc -v
	echo
	echo " *** Environment"
	printenv

	echo

	echo ">>> Job Starts"
	date
	mpirun -np $NSLOTS ./mpi -n 5000 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt

	date
	echo ">>> Job Ends"
	#include <stdlib.h>
	#include <stdio.h>
	#include <assert.h>
	#include <float.h>
	#include <string.h>
	#include <math.h>
	#include <time.h>
	#include <mpi.h>
	#include <sys/time.h>
	#include "common.h"

	double size;

	//
	// tuned constants
	//
	#define density 0.0010
	#define mass 0.01
	#define cutoff 0.01
	#define overlap 0.011
	#define min_r (cutoff/100)
	#define dt 0.0005

	void setup_ghost_buffers( neighbor_t *neighbors, double size_x, double size_y){
	neighbor_t self = neighbors[SELF];
	for(int i = 1; i < 9; i++){
	if (neighbors[i].proc_x - self.proc_x != 0 && neighbors[i].proc_y - self.proc_y != 0) {
	neighbors[i].buffer_size = (int)(min(ceil(size_y),ceil(size_x)) / mass);
	} else if (neighbors[i].proc_x - self.proc_x != 0) {
	neighbors[i].buffer_size = (int)ceil(size_y / mass);
	} else if (neighbors[i].proc_y - self.proc_y != 0) {
	neighbors[i].buffer_size = (int)ceil(size_x / mass);
	}
	neighbors[i].send_buffer = (particle_t) malloc( sizeof(particle_t) neighbors[i].buffer_size);
	}
	}

	void copy_to_ghost_buffers( particle_t particles, neighbor_t neighbors) {
	neighbor_t self = neighbors[SELF];
	for(int i = 1; i < 9; i++){
	neighbors[i].send_size = 0;
	}
	for (int i = self.offset; i < self.offset + self.recv_size; i++){
	if (particles[i].x <= self.from_x + overlap && particles[i].y <= self.from_y + overlap && neighbors[TOP_LEFT].rank != -1) {//TOP LEFT particle
	if (neighbors[TOP_LEFT].buffer_size <= neighbors[TOP_LEFT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_LEFT].rank, neighbors[TOP_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[TOP_LEFT].send_buffer[neighbors[TOP_LEFT].send_size] = particles[i];
	}
	neighbors[TOP_LEFT].send_size = neighbors[TOP_LEFT].send_size + 1;
	}
	if (self.to_x - overlap < particles[i].x && particles[i].y <= self.from_y + overlap && neighbors[TOP_RIGHT].rank != -1) {//TOP RIGHT particle
	if (neighbors[TOP_RIGHT].buffer_size <= neighbors[TOP_RIGHT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_RIGHT].rank, neighbors[TOP_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[TOP_RIGHT].send_buffer[neighbors[TOP_RIGHT].send_size] = particles[i];
	}
	neighbors[TOP_RIGHT].send_size = neighbors[TOP_RIGHT].send_size + 1;
	}
	if (particles[i].x <= self.from_x + overlap && self.to_y - overlap < particles[i].y && neighbors[BOTTOM_LEFT].rank != -1) {// BOTTOM LEFT particle
	if (neighbors[BOTTOM_LEFT].buffer_size <= neighbors[BOTTOM_LEFT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_LEFT].rank, neighbors[BOTTOM_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[BOTTOM_LEFT].send_buffer[neighbors[BOTTOM_LEFT].send_size] = particles[i];
	}
	neighbors[BOTTOM_LEFT].send_size = neighbors[BOTTOM_LEFT].send_size + 1;
	}
	if (self.to_x - overlap < particles[i].x && self.to_y - overlap < particles[i].y && neighbors[BOTTOM_RIGHT].rank != -1) {//BOTTOM RIGHT particle
	if (neighbors[BOTTOM_RIGHT].buffer_size <= neighbors[BOTTOM_RIGHT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_RIGHT].rank, neighbors[BOTTOM_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[BOTTOM_RIGHT].send_buffer[neighbors[BOTTOM_RIGHT].send_size] = particles[i];
	}
	neighbors[BOTTOM_RIGHT].send_size = neighbors[BOTTOM_RIGHT].send_size + 1;
	}
	if (self.to_x - overlap < particles[i].x && neighbors[STRAIGHT_RIGHT].rank != -1) { // STRAIGHT RIGHT particle
	if (neighbors[STRAIGHT_RIGHT].buffer_size <= neighbors[STRAIGHT_RIGHT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[STRAIGHT_RIGHT].rank, neighbors[STRAIGHT_RIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[STRAIGHT_RIGHT].send_buffer[neighbors[STRAIGHT_RIGHT].send_size] = particles[i];
	}
	neighbors[STRAIGHT_RIGHT].send_size = neighbors[STRAIGHT_RIGHT].send_size + 1;
	}
	if (particles[i].x <= self.from_x + overlap && neighbors[STRAIGHT_LEFT].rank != -1) { // STRAIGHT LEFT particle
	if (neighbors[STRAIGHT_LEFT].buffer_size <= neighbors[STRAIGHT_LEFT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[STRAIGHT_LEFT].rank, neighbors[STRAIGHT_LEFT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[STRAIGHT_LEFT].send_buffer[neighbors[STRAIGHT_LEFT].send_size] = particles[i];
	}
	neighbors[STRAIGHT_LEFT].send_size = neighbors[STRAIGHT_LEFT].send_size + 1;
	}
	if (self.to_y - overlap < particles[i].y && neighbors[BOTTOM_STRAIGHT].rank != -1) { // BOTTOM STRAIGHT particle
	if (neighbors[BOTTOM_STRAIGHT].buffer_size <= neighbors[BOTTOM_STRAIGHT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[BOTTOM_STRAIGHT].rank, neighbors[BOTTOM_STRAIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[BOTTOM_STRAIGHT].send_buffer[neighbors[BOTTOM_STRAIGHT].send_size] = particles[i];
	}
	neighbors[BOTTOM_STRAIGHT].send_size = neighbors[BOTTOM_STRAIGHT].send_size + 1;
	}
	if (particles[i].y <= self.from_y + overlap && neighbors[TOP_STRAIGHT].rank != -1) { // TOP STRAIGHT particle
	if (neighbors[TOP_STRAIGHT].buffer_size <= neighbors[TOP_STRAIGHT].send_size){
	printf("Rank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.rank, neighbors[TOP_STRAIGHT].rank, neighbors[TOP_STRAIGHT].buffer_size, neighbors[TOP_LEFT].send_size +1);
	} else {
	neighbors[TOP_STRAIGHT].send_buffer[neighbors[TOP_STRAIGHT].send_size] = particles[i];
	}
	neighbors[TOP_STRAIGHT].send_size = neighbors[TOP_STRAIGHT].send_size + 1;
	}
	}
	}

	//
	// timer
	//
	double read_timer( )
	{
	static bool initialized = false;
	static struct timeval start;
	struct timeval end;
	if( !initialized )
	{
	gettimeofday( &start, NULL );
	initialized = true;
	}
	gettimeofday( &end, NULL );
	return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
	}

	//
	// keep density constant
	//
	double set_size( int n )
	{
	size = sqrt( density * n );
	return size;
	}

	//
	// Initialize the particle positions and velocities
	//
	void init_particles( int n, particle_t *p )
	{
	srand48( 1 );

	int sx = (int)ceil(sqrt((double)n));
	int sy = (n+sx-1)/sx;

	int shuffle = (int)malloc( n * sizeof(int) );
	for( int i = 0; i < n; i++ )
	shuffle[i] = i;

	for( int i = 0; i < n; i++ )
	{
	//
	// make sure particles are not spatially sorted
	//
	int j = lrand48()%(n-i);
	int k = shuffle[j];
	shuffle[j] = shuffle[n-i-1];

	//
	// distribute particles evenly to ensure proper spacing
	//
	p[i].x = size*(1.+(k%sx))/(1+sx);
	p[i].y = (size/2)*(1.+(k/sx))/(1+sy);
	p[i].index = i;

	//
	// assign random velocities within a bound
	//
	p[i].vx = drand48()*2-1;
	p[i].vy = drand48()*2-1;
	}
	free( shuffle );
	}

	//
	// interact two particles
	//
	void apply_force( particle_t &particle, particle_t &neighbor )
	{

	double dx = neighbor.x - particle.x;
	double dy = neighbor.y - particle.y;
	double r2 = dx * dx + dy * dy;
	if( r2 > cutoff*cutoff )
	return;
	r2 = fmax( r2, min_r*min_r );
	double r = sqrt( r2 );

	//
	// very simple short-range repulsive force
	//
	double coef = ( 1 - cutoff / r ) / r2 / mass;
	particle.ax += coef * dx;
	particle.ay += coef * dy;
	}

	//
	// integrate the ODE
	//
	void move( particle_t &p )
	{
	//
	// slightly simplified Velocity Verlet integration
	// conserves energy better than explicit Euler method
	//
	p.vx += p.ax * dt;
	p.vy += p.ay * dt;
	p.x += p.vx * dt;
	p.y += p.vy * dt;

	//
	// bounce from walls
	//
	while( p.x < 0 \|\| p.x > size )
	{
	p.x = p.x < 0 ? -p.x : 2*size-p.x;
	p.vx = -p.vx;
	}
	while( p.y < 0 \|\| p.y > size )
	{
	p.y = p.y < 0 ? -p.y : 2*size-p.y;
	p.vy = -p.vy;
	}
	}

	//
	// I/O routines
	//
	void save( FILE f, int n, particle_t p )
	{
	static bool first = true;
	if( first )
	{
	fprintf( f, "%d %g\n", n, size );
	first = false;
	}
	for( int i = 0; i < n; i++ )
	fprintf( f, "%g %g\n", p[i].x, p[i].y );
	}

	//
	// command line option processing
	//
	int find_option( int argc, char *argv, const char option )
	{
	for( int i = 1; i < argc; i++ )
	if( strcmp( argv[i], option ) == 0 )
	return i;
	return -1;
	}

	int read_int( int argc, char *argv, const char option, int default_value )
	{
	int iplace = find_option( argc, argv, option );
	if( iplace >= 0 && iplace < argc-1 )
	return atoi( argv[iplace+1] );
	return default_value;
	}

	char read_string( int argc, char argv, const char option, char *default_value )
	{
	int iplace = find_option( argc, argv, option );
	if( iplace >= 0 && iplace < argc-1 )
	return argv[iplace+1];
	return default_value;
	}
	#ifndef __CS267_COMMON_H__
	#define __CS267_COMMON_H__

	inline int min( int a, int b ) { return a < b ? a : b; }
	inline int max( int a, int b ) { return a > b ? a : b; }

	//
	// saving parameters
	//
	const int NSTEPS = 1000;
	const int SAVEFREQ = 10;

	const int SELF = 0,
	BOTTOM_STRAIGHT = 1,
	BOTTOM_LEFT = 2,
	STRAIGHT_LEFT = 3,
	TOP_LEFT = 4,
	TOP_STRAIGHT = 5,
	TOP_RIGHT = 6,
	STRAIGHT_RIGHT = 7,
	BOTTOM_RIGHT = 8;

	//
	// particle data structure
	//
	typedef struct
	{
	double index;
	double x;
	double y;
	double vx;
	double vy;
	double ax;
	double ay;
	} particle_t;

	typedef struct
	{
	int offset;
	int send_size;
	int recv_size;
	int proc_x;
	int proc_y;
	int rank;
	double from_x;
	double from_y;
	double to_x;
	double to_y;
	particle_t *send_buffer;
	int buffer_size;
	MPI_Request recv_req;
	MPI_Request send_req;
	MPI_Status recv_stat;
	MPI_Status send_stat;
	} neighbor_t;

	//
	// timing routines
	//
	double read_timer( );


	void copy_to_ghost_buffers( particle_t particles, neighbor_t neighbors);
	void setup_ghost_buffers( neighbor_t *neighbors, double size_x, double size_y);

	//
	// simulation routines
	//
	double set_size( int n );
	void init_particles( int n, particle_t *p );
	void apply_force( particle_t &particle, particle_t &neighbor );
	void move( particle_t &p );

	//
	// I/O routines
	//
	FILE open_save( char filename, int n );
	void save( FILE f, int n, particle_t p );

	//
	// argument processing routines
	//
	int find_option( int argc, char *argv, const char option );
	int read_int( int argc, char *argv, const char option, int default_value );
	char read_string( int argc, char argv, const char option, char *default_value );

	#endif
	HOST = $(shell hostname)
	BANG = $(shell expr match `hostname` ccom-bang)
	BANG-COMPUTE = $(shell expr match `hostname` compute)
	LILLIPUT = $(shell expr match `hostname` lilliput)



	ifneq ($(BANG), 0)
	PUB = /share/class/public/cse260-fa12
	include $(PUB)/Arch/arch.gnu.generic
	else
	ifneq ($(BANG-COMPUTE), 0)
	PUB = /share/class/public/cse260-fa12
	include $(PUB)/Arch/arch.gnu.generic
	else
	ifneq ($(LILLIPUT), 0)
	PUB = /class/public/cse260-fa12
	include $(PUB)/Arch/arch.intel.generic
	else
	# PUB = /Users/baden/lib
	include $(PUB)/Arch/arch.gnu
	# include $(PUB)/Arch/arch.gnu-4.5
	endif
	endif
	endif
	#
	# Add symbol table information for gdb/cachegrind
	ifeq ($(debug), 1)
	CFLAGS += -g
	LDFLAGS += -g
	C++FLAGS += -g
	endif


	# Add symbol table information for gprof
	ifeq ($(gprof), 1)
	CFLAGS += -g -pg
	C++FLAGS += -g -pg
	LDFLAGS += -g -pg
	endif

	# If you want to compile for single precision,
	# specify single=1 on the "make" command line
	ifeq ($(single), 1)
	else
	C++FLAGS += -D_DOUBLE
	CFLAGS += -D_DOUBLE
	endif


	# If you want to compile so that you call the plotter for
	# flattened 2D arrays (implemented as 1D arrays)
	# specify flattened=1 on the "make" command line
	ifeq ($(flattened), 1)
	C++FLAGS += -DPLOT1D
	CFLAGS += -DPLOT1D
	endif

	# If you want to use restrict pointers, make restrict=1
	# This applies to the hand code version
	ifeq ($(restrict), 1)
	C++FLAGS += -D__RESTRICT
	CFLAGS += -D__RESTRICT
	ifneq ($(CARVER), 0)
	C++FLAGS += -restrict
	CFLAGS += -restrict
	endif
	endif


	#DEBUG += -DDEBUG
	TARGETS = mpi

	app: $(TARGETS)

	OBJECTS = common.o mpi.o.o
	#ifeq ($(no-mpi),1)
	#OBJECTS += Timer.o
	#endif
	app: $(TARGETS)

	mpi: mpi.o common.o
	$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)

	clean:
	$(RM) *.o $(TARGETS);
	$(RM) core.*;