@jstroem
Created December 6, 2012 04:12
CSE260 project: MPI particle simulation using ghost cells and a cyclic block distribution for an even distribution of work.
#!/bin/bash
# This script is interpreted by the Bourne Shell, sh
#
# Documentation for SGE is found in:
# http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
#
# Tell SGE which shell to run the job script in rather than depending
# on SGE to try and figure it out.
#$ -S /bin/bash
#
# Export all my environment variables to the job
#$ -V
#
# Run the job in the same directory from which you submitted it
#$ -cwd
#
#
# --- Don't change anything above this line ---
#
# Give a name to the job
#$ -N MPI
#
# Specify a time limit for the job, not more than 30 minutes
#$ -l h_rt=00:10:00
#
# Specify the parallel environment and number of cores,
# If not a multiple of 8, you'll get the whole node anyway
#$ -pe orte 8
#
# Join stdout and stderr so they are reported in job output file
#$ -j y
#
#
# Choose the queue to run the job
#
# Debug queue: only one node may be used at a time for up to 30 minutes
# Interactive or batch jobs, maximum of 1 job per user running at a time
#
# Normal queue: job may use all available compute nodes (256 cores)
# for up to 60 minutes
# Batch jobs, maximum of 2 jobs per user running at a time
# To use more than one node, specify the "normal" queue
#$ -q normal.q
# #$ -q debug.q
#
# Specifies the circumstances under which mail is to be sent to the job owner
# defined by -M option. For example, options "bea" cause mail to be sent at the
# beginning, end, and at abort time (if it happens) of the job.
# Option "n" means no mail will be sent.
#$ -m aeb
#
# *** Change to the address you want the notification sent to, and
# *** REMOVE the blank between the # and the $
#$ -M [email protected]
#
echo
echo " *** Current working directory"
pwd
echo
echo " *** Compiler"
# Output which compiler we are using and the environment
mpicc -v
echo
echo " *** Environment"
printenv
echo
echo ">>> Job Starts"
date
mpirun -np $NSLOTS ./mpi -n 5000 -proc_width 4 -proc_height 2 -split_factor 2 -o mpi.txt -c checksum.txt
date
echo ">>> Job Ends"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <float.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#include <sys/time.h>
#include "common.h"
double size;
//
// tuned constants
//
#define density 0.0005
#define mass 0.01
#define cutoff 0.01
#define overlap 0.011
#define min_r (cutoff/100)
#define dt 0.0005
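//
// calc_array_size: a generous capacity estimate for the local particle array,
// so it can hold the owned particles plus the ghost regions of the 8 neighbors
// of every block (split_factor^2 blocks per rank, n_proc ranks).
//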
int calc_array_size(int n, double size_x, double size_y, int split_factor, int n_proc){
return max(n, (int)ceil(fmax(size_x,size_y) / mass) * 8 * (split_factor * split_factor * n_proc));
}
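//
// setup_neighbors: fill in the 8 neighbors of a block. Process coordinates
// wrap around cyclically in x and y; a wrap moves the neighbor into the
// adjacent block column/row (block_x/block_y +/- 1). If the resulting block
// coordinates fall outside [0, split_factor) the neighbor does not exist and
// its rank is set to -1.
//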
void setup_neighbors( block_t block, int my_x, int my_y, int proc_width, int proc_height, int split_factor ){
neighbor_t *neighbors = block.neighbors;
neighbors[ BOTTOM_STRAIGHT ].proc_x = my_x; neighbors[ BOTTOM_STRAIGHT ].proc_y = my_y + 1;
neighbors[ BOTTOM_LEFT ].proc_x = my_x - 1; neighbors[ BOTTOM_LEFT ].proc_y = my_y + 1;
neighbors[ STRAIGHT_LEFT ].proc_x = my_x - 1; neighbors[ STRAIGHT_LEFT ].proc_y = my_y;
neighbors[ TOP_LEFT ].proc_x = my_x - 1; neighbors[ TOP_LEFT ].proc_y = my_y - 1;
neighbors[ TOP_STRAIGHT ].proc_x = my_x; neighbors[ TOP_STRAIGHT ].proc_y = my_y - 1;
neighbors[ TOP_RIGHT ].proc_x = my_x + 1; neighbors[ TOP_RIGHT ].proc_y = my_y - 1;
neighbors[ STRAIGHT_RIGHT ].proc_x = my_x + 1; neighbors[ STRAIGHT_RIGHT ].proc_y = my_y;
neighbors[ BOTTOM_RIGHT ].proc_x = my_x + 1; neighbors[ BOTTOM_RIGHT ].proc_y = my_y + 1;
for(int i = 0; i < 8; i++){
if (neighbors[ i ].proc_x < 0) {
neighbors[ i ].proc_x = neighbors[ i ].proc_x + proc_width;
neighbors[ i ].block_x = block.block_x - 1;
} else if (proc_width <= neighbors[ i ].proc_x) {
neighbors[ i ].proc_x = neighbors[ i ].proc_x % proc_width;
neighbors[ i ].block_x = block.block_x + 1;
} else {
neighbors[ i ].block_x = block.block_x;
}
if (neighbors[ i ].proc_y < 0) {
neighbors[ i ].proc_y = neighbors[ i ].proc_y + proc_height;
neighbors[ i ].block_y = block.block_y - 1;
} else if (proc_height <= neighbors[ i ].proc_y) {
neighbors[ i ].proc_y = neighbors[ i ].proc_y % proc_height;
neighbors[ i ].block_y = block.block_y + 1;
} else {
neighbors[ i ].block_y = block.block_y;
}
neighbors[ i ].rank = neighbors[ i ].proc_x + neighbors[ i ].proc_y * proc_width;
if (neighbors[ i ].block_y < 0 || split_factor <= neighbors[ i ].block_y || neighbors[ i ].block_x < 0 || split_factor <= neighbors[ i ].block_x){
neighbors[ i ].rank = -1;
neighbors[ i ].buffer_size = 0;
neighbors[ i ].recv_size = 0;
}
}
}
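//
// setup_ghost_buffers: for each neighbor, estimate how many particles can sit
// along the shared edge (edge length / mass; corners use the shorter edge),
// allocate a send buffer of that capacity, and assign the offset in the local
// particle array where that neighbor's ghost particles will be received.
// The ghost regions are laid out back to back starting at start_offset.
//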
void setup_ghost_buffers(int my_x, int my_y, neighbor_t *neighbors, double size_x, double size_y, int start_offset){
for(int i = 0; i < 8; i++){
if (neighbors[ i ].rank != -1){
if (neighbors[i].proc_x - my_x != 0 && neighbors[i].proc_y - my_y != 0) {
neighbors[i].buffer_size = (int)(ceil(fmin(size_y,size_x) / mass));
} else if (neighbors[i].proc_x - my_x != 0) {
neighbors[i].buffer_size = (int)ceil(size_y / mass);
} else if (neighbors[i].proc_y - my_y != 0) {
neighbors[i].buffer_size = (int)ceil(size_x / mass);
}
} else {
neighbors[i].buffer_size = 0;
}
neighbors[i].offset = (i == 0 ? start_offset : neighbors[ i - 1 ].buffer_size + neighbors[ i - 1 ].offset);
neighbors[i].send_buffer = (particle_t*) malloc( sizeof(particle_t) * neighbors[i].buffer_size);
}
}
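//
// copy_to_ghost_buffers: walk the particles this block owns and copy every
// particle that lies within 'overlap' of a block edge into the send buffer of
// each neighbor that should see it; a corner particle is copied to up to
// three neighbors. send_size counts how many particles each neighbor gets.
//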
void copy_to_ghost_buffers( particle_t *particles, block_t self) {
for(int i = 0; i < 8; i++){
self.neighbors[i].send_size = 0;
}
for (int i = self.offset; i < self.offset + self.size; i++){
if (particles[i].x <= self.from_x + overlap && particles[i].y <= self.from_y + overlap && self.neighbors[TOP_LEFT].rank != -1) {//TOP LEFT particle
if (self.neighbors[TOP_LEFT].buffer_size <= self.neighbors[TOP_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_LEFT].rank, self.neighbors[TOP_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_LEFT].send_buffer[self.neighbors[TOP_LEFT].send_size] = particles[i];
}
self.neighbors[TOP_LEFT].send_size = self.neighbors[TOP_LEFT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && particles[i].y <= self.from_y + overlap && self.neighbors[TOP_RIGHT].rank != -1) {//TOP RIGHT particle
if (self.neighbors[TOP_RIGHT].buffer_size <= self.neighbors[TOP_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_RIGHT].rank, self.neighbors[TOP_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_RIGHT].send_buffer[self.neighbors[TOP_RIGHT].send_size] = particles[i];
}
self.neighbors[TOP_RIGHT].send_size = self.neighbors[TOP_RIGHT].send_size + 1;
}
if (particles[i].x <= self.from_x + overlap && self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_LEFT].rank != -1) {// BOTTOM LEFT particle
if (self.neighbors[BOTTOM_LEFT].buffer_size <= self.neighbors[BOTTOM_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_LEFT].rank, self.neighbors[BOTTOM_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_LEFT].send_buffer[self.neighbors[BOTTOM_LEFT].send_size] = particles[i];
}
self.neighbors[BOTTOM_LEFT].send_size = self.neighbors[BOTTOM_LEFT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_RIGHT].rank != -1) {//BOTTOM RIGHT particle
if (self.neighbors[BOTTOM_RIGHT].buffer_size <= self.neighbors[BOTTOM_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_RIGHT].rank, self.neighbors[BOTTOM_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_RIGHT].send_buffer[self.neighbors[BOTTOM_RIGHT].send_size] = particles[i];
}
self.neighbors[BOTTOM_RIGHT].send_size = self.neighbors[BOTTOM_RIGHT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && self.neighbors[STRAIGHT_RIGHT].rank != -1) { // STRAIGHT RIGHT particle
if (self.neighbors[STRAIGHT_RIGHT].buffer_size <= self.neighbors[STRAIGHT_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[STRAIGHT_RIGHT].rank, self.neighbors[STRAIGHT_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[STRAIGHT_RIGHT].send_buffer[self.neighbors[STRAIGHT_RIGHT].send_size] = particles[i];
}
self.neighbors[STRAIGHT_RIGHT].send_size = self.neighbors[STRAIGHT_RIGHT].send_size + 1;
}
if (particles[i].x <= self.from_x + overlap && self.neighbors[STRAIGHT_LEFT].rank != -1) { // STRAIGHT LEFT particle
if (self.neighbors[STRAIGHT_LEFT].buffer_size <= self.neighbors[STRAIGHT_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[STRAIGHT_LEFT].rank, self.neighbors[STRAIGHT_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[STRAIGHT_LEFT].send_buffer[self.neighbors[STRAIGHT_LEFT].send_size] = particles[i];
}
self.neighbors[STRAIGHT_LEFT].send_size = self.neighbors[STRAIGHT_LEFT].send_size + 1;
}
if (self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_STRAIGHT].rank != -1) { // BOTTOM STRAIGHT particle
if (self.neighbors[BOTTOM_STRAIGHT].buffer_size <= self.neighbors[BOTTOM_STRAIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_STRAIGHT].rank, self.neighbors[BOTTOM_STRAIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_STRAIGHT].send_buffer[self.neighbors[BOTTOM_STRAIGHT].send_size] = particles[i];
}
self.neighbors[BOTTOM_STRAIGHT].send_size = self.neighbors[BOTTOM_STRAIGHT].send_size + 1;
}
if (particles[i].y <= self.from_y + overlap && self.neighbors[TOP_STRAIGHT].rank != -1) { // TOP STRAIGHT particle
if (self.neighbors[TOP_STRAIGHT].buffer_size <= self.neighbors[TOP_STRAIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_STRAIGHT].rank, self.neighbors[TOP_STRAIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_STRAIGHT].send_buffer[self.neighbors[TOP_STRAIGHT].send_size] = particles[i];
}
self.neighbors[TOP_STRAIGHT].send_size = self.neighbors[TOP_STRAIGHT].send_size + 1;
}
}
}
//
// timer
//
double read_timer( )
{
static bool initialized = false;
static struct timeval start;
struct timeval end;
if( !initialized )
{
gettimeofday( &start, NULL );
initialized = true;
}
gettimeofday( &end, NULL );
return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
}
//
// keep the density constant: the domain is size x size with size^2 = density * n
//
double set_size( int n )
{
size = sqrt( density * n );
return size;
}
//
// Initialize the particle positions and velocities
//
void init_particles( int n, particle_t *p )
{
srand48( 1 );
int sx = (int)ceil(sqrt((double)n));
int sy = (n+sx-1)/sx;
int *shuffle = (int*)malloc( n * sizeof(int) );
for( int i = 0; i < n; i++ )
shuffle[i] = i;
for( int i = 0; i < n; i++ )
{
//
// make sure particles are not spatially sorted
//
int j = lrand48()%(n-i);
int k = shuffle[j];
shuffle[j] = shuffle[n-i-1];
//
// distribute particles evenly to ensure proper spacing
//
p[i].x = size*(1.+(k%sx))/(1+sx);
p[i].y = size*(1.+(k/sx))/(1+sy);
p[i].index = i;
//
// assign random velocities within a bound
//
p[i].vx = drand48()*2-1;
p[i].vy = drand48()*2-1;
}
free( shuffle );
}
//
// interact two particles
//
void apply_force( particle_t &particle, particle_t &neighbor )
{
double dx = neighbor.x - particle.x;
double dy = neighbor.y - particle.y;
double r2 = dx * dx + dy * dy;
if( r2 > cutoff*cutoff )
return;
r2 = fmax( r2, min_r*min_r );
double r = sqrt( r2 );
//
// very simple short-range repulsive force
//
double coef = ( 1 - cutoff / r ) / r2 / mass;
particle.ax += coef * dx;
particle.ay += coef * dy;
}
//
// integrate the ODE
//
void move( particle_t &p )
{
//
// slightly simplified Velocity Verlet integration
// conserves energy better than explicit Euler method
//
p.vx += p.ax * dt;
p.vy += p.ay * dt;
p.x += p.vx * dt;
p.y += p.vy * dt;
//
// bounce from walls
//
while( p.x < 0 || p.x > size )
{
p.x = p.x < 0 ? -p.x : 2*size-p.x;
p.vx = -p.vx;
}
while( p.y < 0 || p.y > size )
{
p.y = p.y < 0 ? -p.y : 2*size-p.y;
p.vy = -p.vy;
}
}
//
// I/O routines
//
void save( FILE *f, int n, particle_t *p )
{
static bool first = true;
if( first )
{
fprintf( f, "%d %g\n", n, size );
first = false;
}
for( int i = 0; i < n; i++ )
fprintf( f, "%g %g\n", p[i].x, p[i].y );
}
//
// command line option processing
//
int find_option( int argc, char **argv, const char *option )
{
for( int i = 1; i < argc; i++ )
if( strcmp( argv[i], option ) == 0 )
return i;
return -1;
}
int read_int( int argc, char **argv, const char *option, int default_value )
{
int iplace = find_option( argc, argv, option );
if( iplace >= 0 && iplace < argc-1 )
return atoi( argv[iplace+1] );
return default_value;
}
char *read_string( int argc, char **argv, const char *option, char *default_value )
{
int iplace = find_option( argc, argv, option );
if( iplace >= 0 && iplace < argc-1 )
return argv[iplace+1];
return default_value;
}
#ifndef __CS267_COMMON_H__
#define __CS267_COMMON_H__
inline int min( int a, int b ) { return a < b ? a : b; }
inline int max( int a, int b ) { return a > b ? a : b; }
//
// saving parameters
//
const int NSTEPS = 1000;
const int SAVEFREQ = 10;
const int BOTTOM_STRAIGHT = 0,
BOTTOM_LEFT = 1,
STRAIGHT_LEFT = 2,
TOP_LEFT = 3,
TOP_STRAIGHT = 4,
TOP_RIGHT = 5,
STRAIGHT_RIGHT = 6,
BOTTOM_RIGHT = 7;
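//
// Neighbor layout around a block at coordinates (x, y), as computed in
// setup_neighbors(): TOP is y - 1, BOTTOM is y + 1, LEFT is x - 1, RIGHT is x + 1.
//
//   TOP_LEFT         TOP_STRAIGHT      TOP_RIGHT
//   STRAIGHT_LEFT       (self)         STRAIGHT_RIGHT
//   BOTTOM_LEFT      BOTTOM_STRAIGHT   BOTTOM_RIGHT
//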
//
// particle data structure
//
typedef struct
{
double index; // original particle id; stored as a double so particle_t maps onto 7 MPI_DOUBLEs
double x;
double y;
double vx;
double vy;
double ax;
double ay;
} particle_t;
typedef struct
{
int offset;
int send_size;
int recv_size;
int proc_x;
int proc_y;
int block_x;
int block_y;
int rank;
particle_t *send_buffer;
int buffer_size;
MPI_Request recv_req;
MPI_Request send_req;
MPI_Status recv_stat;
MPI_Status send_stat;
} neighbor_t;
typedef struct
{
double from_x;
double from_y;
double to_x;
double to_y;
neighbor_t *neighbors;
int offset;
int size;
int block_x;
int subrank;
int block_y;
} block_t;
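//
// Layout of a rank's particle array: each block's owned particles occupy
// [offset, offset + size), packed one block after another; the ghost particles
// received from each block's 8 neighbors follow, with each neighbor's region
// starting at neighbors[j].offset and holding neighbors[j].recv_size valid entries.
//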
//
// timing routines
//
double read_timer( );
int calc_array_size(int n, double size_x, double size_y, int split_factor, int n_proc);
void copy_to_ghost_buffers( particle_t *particles, block_t self);
void setup_ghost_buffers( int my_x, int my_y, neighbor_t *neighbors, double size_x, double size_y, int start_offset);
void setup_neighbors( block_t block, int my_x, int my_y, int proc_width, int proc_height, int split_factor );
//
// simulation routines
//
double set_size( int n );
void init_particles( int n, particle_t *p );
void apply_force( particle_t &particle, particle_t &neighbor );
void move( particle_t &p );
//
// I/O routines
//
FILE *open_save( char *filename, int n );
void save( FILE *f, int n, particle_t *p );
//
// argument processing routines
//
int find_option( int argc, char **argv, const char *option );
int read_int( int argc, char **argv, const char *option, int default_value );
char *read_string( int argc, char **argv, const char *option, char *default_value );
#endif
HOST = $(shell hostname)
BANG = $(shell expr match `hostname` ccom-bang)
BANG-COMPUTE = $(shell expr match `hostname` compute)
LILLIPUT = $(shell expr match `hostname` lilliput)
ifneq ($(BANG), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(BANG-COMPUTE), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(LILLIPUT), 0)
PUB = /class/public/cse260-fa12
include $(PUB)/Arch/arch.intel.generic
else
# PUB = /Users/baden/lib
include $(PUB)/Arch/arch.gnu
# include $(PUB)/Arch/arch.gnu-4.5
endif
endif
endif
#
# Add symbol table information for gdb/cachegrind
ifeq ($(debug), 1)
CFLAGS += -g
LDFLAGS += -g
C++FLAGS += -g
endif
# Add symbol table information for gprof
ifeq ($(gprof), 1)
CFLAGS += -g -pg
C++FLAGS += -g -pg
LDFLAGS += -g -pg
endif
# If you want to compile for single precision,
# specify single=1 on the "make" command line
ifeq ($(single), 1)
else
C++FLAGS += -D_DOUBLE
CFLAGS += -D_DOUBLE
endif
# If you want to compile so that you call the plotter for
# flattened 2D arrays (implemented as 1D arrays)
# specify flattened=1 on the "make" command line
ifeq ($(flattened), 1)
C++FLAGS += -DPLOT1D
CFLAGS += -DPLOT1D
endif
# If you want to use restrict pointers, make restrict=1
# This applies to the hand code version
ifeq ($(restrict), 1)
C++FLAGS += -D__RESTRICT
CFLAGS += -D__RESTRICT
ifneq ($(CARVER), 0)
C++FLAGS += -restrict
CFLAGS += -restrict
endif
endif
#DEBUG += -DDEBUG
TARGETS = mpi
app: $(TARGETS)
OBJECTS = common.o mpi.o
#ifeq ($(no-mpi),1)
#OBJECTS += Timer.o
#endif
mpi: mpi.o common.o
$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)
clean:
$(RM) *.o $(TARGETS);
$(RM) core.*;
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "common.h"
//
// benchmarking program
//
int main( int argc, char **argv )
{
//
// process command line parameters
//
if( find_option( argc, argv, "-h" ) >= 0 )
{
printf( "Options:\n" );
printf( "-h to see this help\n" );
printf( "-n <int> to set the number of particles\n" );
printf( "-proc_width <int> to set the width of the processor grid\n" );
printf( "-proc_height <int> to set the height of the processor grid\n" );
printf( "-split_factor <int> to set split factor of the processor grid\n" );
printf( "-o <filename> to specify the output file name\n" );
printf( "-c <checksum> to specify the output checksum file name\n" );
return 0;
}
int n = read_int( argc, argv, "-n", 1000 );
int proc_width = read_int( argc, argv, "-proc_width", 0 );
int proc_height = read_int( argc, argv, "-proc_height", 0 );
int split_factor = read_int( argc, argv, "-split_factor", 0 );
char *savename = read_string( argc, argv, "-o", NULL );
char *checksum = read_string( argc, argv, "-c", NULL );
//
// set up MPI
//
int n_proc, myrank;
MPI_Init( &argc, &argv );
MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
//Ensure that proc_width and proc_height match the number of processes.
if ((proc_height * proc_width) != n_proc){
if (!myrank)
printf("\n *** The number of processors in the geometry (%d) is not the same as the number requested (%d)\n",proc_height*proc_width,n_proc);
MPI_Abort(MPI_COMM_WORLD, -1);
}
//
// initialize sizes and ranks.
//
double size = set_size( n );
double size_x = (size / proc_width) / split_factor;
double size_y = (size / proc_height) / split_factor;
int my_x = myrank % proc_width;
int my_y = myrank / proc_width;
block_t *blocks = (block_t*) malloc(split_factor * split_factor * sizeof(block_t));
int block_no = (n_proc * split_factor * split_factor);
//
// allocate generic resources
//
FILE *fsave = savename && myrank == 0 ? fopen( savename, "w" ) : NULL;
FILE *fchecksum = checksum && myrank == 0 ? fopen( checksum, "w" ) : NULL;
int array_size = calc_array_size(n, size_x,size_y,split_factor,n_proc);
particle_t *particles = (particle_t*) malloc( array_size * sizeof(particle_t));
particle_t *particles_prev = (particle_t*) malloc( array_size * sizeof(particle_t));
MPI_Datatype PARTICLE;
MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE ); // one particle_t = 7 doubles (index, x, y, vx, vy, ax, ay)
MPI_Type_commit( &PARTICLE );
if (!myrank) { //Init all particles at rank 0
init_particles( n, particles_prev );
}
MPI_Bcast(particles_prev, n, PARTICLE, 0, MPI_COMM_WORLD);
//
// Set up the blocks and their neighbors
//
int ij;
for (int i = 0; i < split_factor; i++){
for (int j = 0; j < split_factor; j++){
ij = i*split_factor+j;
blocks[ij].block_x = j;
blocks[ij].block_y = i;
blocks[ij].from_x = (my_x + j * proc_width) * size_x;
blocks[ij].from_y = (my_y + i * proc_height) * size_y;
blocks[ij].to_x = blocks[ij].from_x + size_x;
blocks[ij].to_y = blocks[ij].from_y + size_y;
blocks[ij].subrank = ((j*n_proc) + (i*split_factor*n_proc) + myrank);
blocks[ij].offset = (ij == 0 ? 0 : blocks[ij - 1].size + blocks[ij - 1].offset);
blocks[ij].size = 0;
for(int k=0; k<n;k++){
if (particles_prev[k].x >= blocks[ij].from_x && particles_prev[k].x < blocks[ij].to_x &&
particles_prev[k].y >= blocks[ij].from_y && particles_prev[k].y < blocks[ij].to_y){
particles[blocks[ij].offset + blocks[ij].size] = particles_prev[k];
blocks[ij].size += 1;
}
}
blocks[ij].neighbors = (neighbor_t*) malloc( 8 * sizeof(neighbor_t) );
setup_neighbors( blocks[ij], my_x, my_y, proc_width, proc_height, split_factor );
}
}
int last_block_index = split_factor*split_factor-1;
int start_offset = blocks[last_block_index].offset + blocks[last_block_index].size;
for (int i = 0; i < split_factor*split_factor; i++){
setup_ghost_buffers(my_x,my_y, blocks[i].neighbors, size_x, size_y, start_offset);
start_offset = blocks[i].neighbors[7].offset + blocks[i].neighbors[7].buffer_size;
}
//Post non-blocking receives for the incoming ghost particles.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1){
MPI_Irecv(&particles[blocks[i].neighbors[j].offset],
blocks[i].neighbors[j].buffer_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].block_x + blocks[i].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].recv_req );
}
}
}
//Copy boundary particles into the ghost send buffers and send them.
for (int i = 0; i < split_factor * split_factor; i++){
copy_to_ghost_buffers( particles, blocks[i] );
for(int j = 0; j < 8; j++){
if (blocks[i].neighbors[j].rank != -1){
MPI_Isend(&blocks[i].neighbors[j].send_buffer[0],
blocks[i].neighbors[j].send_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].neighbors[j].block_x + blocks[i].neighbors[j].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].send_req );
}
}
}
//Wait for the receives and record how many ghost particles actually arrived.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].recv_req,&blocks[i].neighbors[j].recv_stat);
MPI_Get_count(&blocks[i].neighbors[j].recv_stat, PARTICLE, &blocks[i].neighbors[j].recv_size);
}
}
}
//Make sure all sends have completed before the buffers are reused.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].send_req,&blocks[i].neighbors[j].send_stat);
}
}
}
particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;
//Rank 0 gathers the per-rank particle counts to sanity-check the total.
int *n_proc_sizes = new int[n_proc];
int *n_proc_offset = new int[n_proc];
int count;
int size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
if(!myrank ) {
int real_n = 0;
for(int i = 0; i < n_proc; i++){
real_n += n_proc_sizes[i];
}
printf("Before correctness check: n: %d, real-n: %d\n",n, real_n);
}
//
// simulate a number of time steps
//
double simulation_time = read_timer( );
for( int step = 0; step < NSTEPS; step++ )
{
//
// save current step if necessary (slightly different semantics than in other codes)
//
if( (step%SAVEFREQ) == 0 && savename ) {
//Collect the per-rank particle counts.
size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
//Calculate offsets
if (myrank == 0){
for(int i = 0; i < n_proc; i++){
if (i == 0) {
n_proc_offset[i] = 0;
} else {
n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
}
}
}
MPI_Gatherv( &particles_prev[blocks[0].offset], size_of_array, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
if (myrank == 0 && fsave) {
save( fsave, n, particles );
}
}
//
// compute forces for our own particles and for the ghost particles from the neighbors
//
for( int i = 0; i < split_factor * split_factor; i++){
for(int k = blocks[i].offset; k < blocks[i].offset + blocks[i].size; k++){
particles_prev[k].ax = particles_prev[k].ay = 0;
for(int l = blocks[i].offset; l < blocks[i].offset + blocks[i].size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
for(int j = 0; j < 8; j++){
for(int l = blocks[i].neighbors[j].offset; l < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
}
}
for(int j = 0; j < 8; j++){
for(int k = blocks[i].neighbors[j].offset; k < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; k++ ) {
particles_prev[k].ax = particles_prev[k].ay = 0;
for(int l = blocks[i].offset; l < blocks[i].offset + blocks[i].size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
for(int z = 0; z < 8; z++){
for(int l = blocks[i].neighbors[z].offset; l < blocks[i].neighbors[z].offset + blocks[i].neighbors[z].recv_size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
}
}
}
}
//
// move particles
//
int new_offset = blocks[0].offset;
for( int i = 0; i < split_factor * split_factor; i++){
count = 0;
int removed = 0;
int newin = 0;
for(int k = blocks[i].offset; k < blocks[i].offset + blocks[i].size; k++ ){
move(particles_prev[k]);
if (particles_prev[k].x >= blocks[i].from_x && particles_prev[k].x < blocks[i].to_x &&
particles_prev[k].y >= blocks[i].from_y && particles_prev[k].y < blocks[i].to_y){
particles[count+new_offset] = particles_prev[k];
count = count + 1;
} else {
removed = removed + 1;
}
}
for(int j = 0; j < 8; j++){
for(int k = blocks[i].neighbors[j].offset; k < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; k++ ) {
move(particles_prev[k]);
if (particles_prev[k].x >= blocks[i].from_x && particles_prev[k].x < blocks[i].to_x &&
particles_prev[k].y >= blocks[i].from_y && particles_prev[k].y < blocks[i].to_y){
particles[count + new_offset] = particles_prev[k];
count = count + 1;
newin = newin + 1;
}
}
blocks[i].neighbors[j].recv_size = 0;
}
blocks[i].size = count;
blocks[i].offset = new_offset;
new_offset = blocks[i].offset + blocks[i].size;
}
//Recompute the ghost-buffer offsets now that the owned particles have been repacked.
int start_offset = blocks[last_block_index].offset + blocks[last_block_index].size;
for (int i = 0; i < split_factor*split_factor; i++){
for(int j = 0; j < 8; j++) {
blocks[i].neighbors[j].offset = start_offset;
start_offset = start_offset + blocks[i].neighbors[j].buffer_size;
}
}
//Post non-blocking receives for the incoming ghost particles.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1){
MPI_Irecv(&particles[blocks[i].neighbors[j].offset],
blocks[i].neighbors[j].buffer_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].block_x + blocks[i].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].recv_req );
}
}
}
//Copy boundary particles into the ghost send buffers and send them.
for (int i = 0; i < split_factor * split_factor; i++){
copy_to_ghost_buffers( particles, blocks[i] );
for(int j = 0; j < 8; j++){
if (blocks[i].neighbors[j].rank != -1){
MPI_Isend(&blocks[i].neighbors[j].send_buffer[0],
blocks[i].neighbors[j].send_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].neighbors[j].block_x + blocks[i].neighbors[j].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].send_req );
}
}
}
//Wait for the receives and record how many ghost particles actually arrived.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].recv_req,&blocks[i].neighbors[j].recv_stat);
MPI_Get_count(&blocks[i].neighbors[j].recv_stat, PARTICLE, &blocks[i].neighbors[j].recv_size);
}
}
}
particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;
//Make sure all sends have completed before the buffers are reused.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].send_req,&blocks[i].neighbors[j].send_stat);
}
}
}
}
simulation_time = read_timer( ) - simulation_time;
size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
if(!myrank ) {
int real_n = 0;
for(int i = 0; i < n_proc; i++){
if (i == 0) {
n_proc_offset[i] = 0;
} else {
n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
}
real_n += n_proc_sizes[i];
}
printf("After correctness check: n: %d, real-n: %d\n",n, real_n);
printf( "n = %d, n_procs = %d, proc_width = %d, proc_height = %d, split_factor = %d, simulation time = %g s\n", n, n_proc, proc_width, proc_height, split_factor, simulation_time );
}
MPI_Gatherv( &particles_prev[blocks[0].offset], size_of_array, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
//Reorder particles by their original index before writing the checksum file.
if (!myrank && fchecksum){
for(int j = 0; j < n; j++){
particles_prev[(int)particles[j].index] = particles[j];
}
save( fchecksum, n, particles_prev );
}
//
// release resources
//
for(int i = 0; i < split_factor*split_factor; i++){
free(blocks[i].neighbors);
}
free( blocks );
free( particles );
free( particles_prev );
if( fsave )
fclose( fsave );
if (fchecksum)
fclose( fchecksum );
MPI_Finalize( );
return 0;
}
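The cyclic block distribution can be hard to picture from the setup loop alone. The standalone sketch below (not part of the gist's build; the grid dimensions simply mirror the -proc_width 4 -proc_height 2 -split_factor 2 example from the job script) prints which rank owns each block, using the same mapping as mpi.cpp: from_x = (my_x + j * proc_width) * size_x, so block column bx belongs to the rank whose grid x-coordinate is bx % proc_width, and likewise for rows.

#include <stdio.h>

int main(void)
{
    /* Example grid: matches the mpirun line in the job script. */
    const int proc_width = 4, proc_height = 2, split_factor = 2;

    /* The domain is cut into (proc_width * split_factor) block columns and
       (proc_height * split_factor) block rows. A block at (bx, by) is owned
       by the rank at grid coordinates (bx % proc_width, by % proc_height). */
    for (int by = 0; by < proc_height * split_factor; by++) {
        for (int bx = 0; bx < proc_width * split_factor; bx++) {
            int my_x = bx % proc_width;
            int my_y = by % proc_height;
            int rank = my_x + my_y * proc_width; /* same rank formula as setup_neighbors */
            printf("%2d ", rank);
        }
        printf("\n");
    }
    return 0;
}

Printing the grid makes the even, cyclic interleaving visible: each rank owns split_factor * split_factor blocks scattered across the domain rather than one contiguous patch.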