@jstroem
Created December 6, 2012 04:12
CSE260 project: MPI particle simulation using ghost cells and a cyclic block distribution for an even distribution of work.
#!/bin/bash
# This script is interpreted by the Bourne Shell, sh
#
# Documentation for SGE is found in:
# http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
#
# Tell SGE which shell to run the job script in rather than depending
# on SGE to try and figure it out.
#$ -S /bin/bash
#
# Export all my environment variables to the job
#$ -V
#
# Run the job in the same directory from which you submitted it
#$ -cwd
#
#
# --- Don't change anything above this line ---
#
# Give a name to the job
#$ -N MPI
#
# Specify a time limit for the job, not more than 30 minutes
#$ -l h_rt=00:10:00
#
# Specify the parallel environment and number of cores,
# If not a multiple of 8, you'll get the whole node anyway
#$ -pe orte 8
#
# Join stdout and stderr so they are reported in job output file
#$ -j y
#
#
# Choose the queue to run the job
#
# Debug queue: only one node may be used at a time for up to 30 minutes
# Interactive or batch jobs, maximum of 1 job per user running at a time
#
# Normal queue: job may use all available compute nodes (256 cores)
# for up to 60 minutes
# Batch jobs, maximum of 2 jobs per user running at a time
# To use more than one node, specify the "normal" queue
#$ -q normal.q
# #$ -q debug.q
#
# Specifies the circumstances under which mail is to be sent to the job owner
# defined by -M option. For example, options "bea" cause mail to be sent at the
# beginning, end, and at abort time (if it happens) of the job.
# Option "n" means no mail will be sent.
#$ -m aeb
#
# *** Change to the address you want the notification sent to, and
# *** REMOVE the blank between the # and the $
#$ -M [email protected]
#
echo
echo " *** Current working directory"
pwd
echo
echo " *** Compiler"
# Output which compiler we are using and the environment
mpicc -v
echo
echo " *** Environment"
printenv
echo
echo ">>> Job Starts"
date
mpirun -np $NSLOTS ./mpi -n 5000 -proc_width 4 -proc_height 2 -split_factor 2 -o mpi.txt -c checksum.txt
date
echo ">>> Job Ends"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <float.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#include <sys/time.h>
#include "common.h"
double size;
//
// tuned constants
//
#define density 0.0005
#define mass 0.01
#define cutoff 0.01
#define overlap 0.011
#define min_r (cutoff/100)
#define dt 0.0005
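//
// calc_array_size: a generous capacity estimate for the local particle array,
// so it can hold the owned particles plus the ghost regions of the 8 neighbors
// of every block (split_factor^2 blocks per rank, n_proc ranks).
//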
int calc_array_size(int n, double size_x, double size_y, int split_factor, int n_proc){
return max(n, (int)ceil(fmax(size_x,size_y) / mass) * 8 * (split_factor * split_factor * n_proc));
}
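//
// setup_neighbors: fill in the 8 neighbors of a block. Process coordinates
// wrap around cyclically in x and y; a wrap moves the neighbor into the
// adjacent block column/row (block_x/block_y +/- 1). If the resulting block
// coordinates fall outside [0, split_factor) the neighbor does not exist and
// its rank is set to -1.
//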
void setup_neighbors( block_t block, int my_x, int my_y, int proc_width, int proc_height, int split_factor ){
neighbor_t *neighbors = block.neighbors;
neighbors[ BOTTOM_STRAIGHT ].proc_x = my_x; neighbors[ BOTTOM_STRAIGHT ].proc_y = my_y + 1;
neighbors[ BOTTOM_LEFT ].proc_x = my_x - 1; neighbors[ BOTTOM_LEFT ].proc_y = my_y + 1;
neighbors[ STRAIGHT_LEFT ].proc_x = my_x - 1; neighbors[ STRAIGHT_LEFT ].proc_y = my_y;
neighbors[ TOP_LEFT ].proc_x = my_x - 1; neighbors[ TOP_LEFT ].proc_y = my_y - 1;
neighbors[ TOP_STRAIGHT ].proc_x = my_x; neighbors[ TOP_STRAIGHT ].proc_y = my_y - 1;
neighbors[ TOP_RIGHT ].proc_x = my_x + 1; neighbors[ TOP_RIGHT ].proc_y = my_y - 1;
neighbors[ STRAIGHT_RIGHT ].proc_x = my_x + 1; neighbors[ STRAIGHT_RIGHT ].proc_y = my_y;
neighbors[ BOTTOM_RIGHT ].proc_x = my_x + 1; neighbors[ BOTTOM_RIGHT ].proc_y = my_y + 1;
for(int i = 0; i < 8; i++){
if (neighbors[ i ].proc_x < 0) {
neighbors[ i ].proc_x = neighbors[ i ].proc_x + proc_width;
neighbors[ i ].block_x = block.block_x - 1;
} else if (proc_width <= neighbors[ i ].proc_x) {
neighbors[ i ].proc_x = neighbors[ i ].proc_x % proc_width;
neighbors[ i ].block_x = block.block_x + 1;
} else {
neighbors[ i ].block_x = block.block_x;
}
if (neighbors[ i ].proc_y < 0) {
neighbors[ i ].proc_y = neighbors[ i ].proc_y + proc_height;
neighbors[ i ].block_y = block.block_y - 1;
} else if (proc_height <= neighbors[ i ].proc_y) {
neighbors[ i ].proc_y = neighbors[ i ].proc_y % proc_height;
neighbors[ i ].block_y = block.block_y + 1;
} else {
neighbors[ i ].block_y = block.block_y;
}
neighbors[ i ].rank = neighbors[ i ].proc_x + neighbors[ i ].proc_y * proc_width;
if (neighbors[ i ].block_y < 0 || split_factor <= neighbors[ i ].block_y || neighbors[ i ].block_x < 0 || split_factor <= neighbors[ i ].block_x){
neighbors[ i ].rank = -1;
neighbors[ i ].buffer_size = 0;
neighbors[ i ].recv_size = 0;
}
}
}
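//
// setup_ghost_buffers: for each neighbor, estimate how many particles can sit
// along the shared edge (edge length / mass; corners use the shorter edge),
// allocate a send buffer of that capacity, and assign the offset in the local
// particle array where that neighbor's ghost particles will be received.
// The ghost regions are laid out back to back starting at start_offset.
//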
void setup_ghost_buffers(int my_x, int my_y, neighbor_t *neighbors, double size_x, double size_y, int start_offset){
for(int i = 0; i < 8; i++){
if (neighbors[ i ].rank != -1){
if (neighbors[i].proc_x - my_x != 0 && neighbors[i].proc_y - my_y != 0) {
neighbors[i].buffer_size = (int)(ceil(fmin(size_y,size_x) / mass));
} else if (neighbors[i].proc_x - my_x != 0) {
neighbors[i].buffer_size = (int)ceil(size_y / mass);
} else if (neighbors[i].proc_y - my_y != 0) {
neighbors[i].buffer_size = (int)ceil(size_x / mass);
}
} else {
neighbors[i].buffer_size = 0;
}
neighbors[i].offset = (i == 0 ? start_offset : neighbors[ i - 1 ].buffer_size + neighbors[ i - 1 ].offset);
neighbors[i].send_buffer = (particle_t*) malloc( sizeof(particle_t) * neighbors[i].buffer_size);
}
}
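//
// copy_to_ghost_buffers: walk the particles this block owns and copy every
// particle that lies within 'overlap' of a block edge into the send buffer of
// each neighbor that should see it; a corner particle is copied to up to
// three neighbors. send_size counts how many particles each neighbor gets.
//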
void copy_to_ghost_buffers( particle_t *particles, block_t self) {
for(int i = 0; i < 8; i++){
self.neighbors[i].send_size = 0;
}
for (int i = self.offset; i < self.offset + self.size; i++){
if (particles[i].x <= self.from_x + overlap && particles[i].y <= self.from_y + overlap && self.neighbors[TOP_LEFT].rank != -1) {//TOP LEFT particle
if (self.neighbors[TOP_LEFT].buffer_size <= self.neighbors[TOP_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_LEFT].rank, self.neighbors[TOP_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_LEFT].send_buffer[self.neighbors[TOP_LEFT].send_size] = particles[i];
}
self.neighbors[TOP_LEFT].send_size = self.neighbors[TOP_LEFT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && particles[i].y <= self.from_y + overlap && self.neighbors[TOP_RIGHT].rank != -1) {//TOP RIGHT particle
if (self.neighbors[TOP_RIGHT].buffer_size <= self.neighbors[TOP_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_RIGHT].rank, self.neighbors[TOP_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_RIGHT].send_buffer[self.neighbors[TOP_RIGHT].send_size] = particles[i];
}
self.neighbors[TOP_RIGHT].send_size = self.neighbors[TOP_RIGHT].send_size + 1;
}
if (particles[i].x <= self.from_x + overlap && self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_LEFT].rank != -1) {// BOTTOM LEFT particle
if (self.neighbors[BOTTOM_LEFT].buffer_size <= self.neighbors[BOTTOM_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_LEFT].rank, self.neighbors[BOTTOM_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_LEFT].send_buffer[self.neighbors[BOTTOM_LEFT].send_size] = particles[i];
}
self.neighbors[BOTTOM_LEFT].send_size = self.neighbors[BOTTOM_LEFT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_RIGHT].rank != -1) {//BOTTOM RIGHT particle
if (self.neighbors[BOTTOM_RIGHT].buffer_size <= self.neighbors[BOTTOM_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_RIGHT].rank, self.neighbors[BOTTOM_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_RIGHT].send_buffer[self.neighbors[BOTTOM_RIGHT].send_size] = particles[i];
}
self.neighbors[BOTTOM_RIGHT].send_size = self.neighbors[BOTTOM_RIGHT].send_size + 1;
}
if (self.to_x - overlap < particles[i].x && self.neighbors[STRAIGHT_RIGHT].rank != -1) { // STRAIGHT RIGHT particle
if (self.neighbors[STRAIGHT_RIGHT].buffer_size <= self.neighbors[STRAIGHT_RIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[STRAIGHT_RIGHT].rank, self.neighbors[STRAIGHT_RIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[STRAIGHT_RIGHT].send_buffer[self.neighbors[STRAIGHT_RIGHT].send_size] = particles[i];
}
self.neighbors[STRAIGHT_RIGHT].send_size = self.neighbors[STRAIGHT_RIGHT].send_size + 1;
}
if (particles[i].x <= self.from_x + overlap && self.neighbors[STRAIGHT_LEFT].rank != -1) { // STRAIGHT LEFT particle
if (self.neighbors[STRAIGHT_LEFT].buffer_size <= self.neighbors[STRAIGHT_LEFT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[STRAIGHT_LEFT].rank, self.neighbors[STRAIGHT_LEFT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[STRAIGHT_LEFT].send_buffer[self.neighbors[STRAIGHT_LEFT].send_size] = particles[i];
}
self.neighbors[STRAIGHT_LEFT].send_size = self.neighbors[STRAIGHT_LEFT].send_size + 1;
}
if (self.to_y - overlap < particles[i].y && self.neighbors[BOTTOM_STRAIGHT].rank != -1) { // BOTTOM STRAIGHT particle
if (self.neighbors[BOTTOM_STRAIGHT].buffer_size <= self.neighbors[BOTTOM_STRAIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[BOTTOM_STRAIGHT].rank, self.neighbors[BOTTOM_STRAIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[BOTTOM_STRAIGHT].send_buffer[self.neighbors[BOTTOM_STRAIGHT].send_size] = particles[i];
}
self.neighbors[BOTTOM_STRAIGHT].send_size = self.neighbors[BOTTOM_STRAIGHT].send_size + 1;
}
if (particles[i].y <= self.from_y + overlap && self.neighbors[TOP_STRAIGHT].rank != -1) { // TOP STRAIGHT particle
if (self.neighbors[TOP_STRAIGHT].buffer_size <= self.neighbors[TOP_STRAIGHT].send_size){
printf("Subrank %d, neighbors: %d, send_buffer isnt big enough. Has: %d, needed: %d\n", self.subrank, self.neighbors[TOP_STRAIGHT].rank, self.neighbors[TOP_STRAIGHT].buffer_size, self.neighbors[TOP_LEFT].send_size +1);
} else {
self.neighbors[TOP_STRAIGHT].send_buffer[self.neighbors[TOP_STRAIGHT].send_size] = particles[i];
}
self.neighbors[TOP_STRAIGHT].send_size = self.neighbors[TOP_STRAIGHT].send_size + 1;
}
}
}
//
// timer
//
double read_timer( )
{
static bool initialized = false;
static struct timeval start;
struct timeval end;
if( !initialized )
{
gettimeofday( &start, NULL );
initialized = true;
}
gettimeofday( &end, NULL );
return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
}
//
// keep the density constant: the domain is size x size with size^2 = density * n
//
double set_size( int n )
{
size = sqrt( density * n );
return size;
}
//
// Initialize the particle positions and velocities
//
void init_particles( int n, particle_t *p )
{
srand48( 1 );
int sx = (int)ceil(sqrt((double)n));
int sy = (n+sx-1)/sx;
int *shuffle = (int*)malloc( n * sizeof(int) );
for( int i = 0; i < n; i++ )
shuffle[i] = i;
for( int i = 0; i < n; i++ )
{
//
// make sure particles are not spatially sorted
//
int j = lrand48()%(n-i);
int k = shuffle[j];
shuffle[j] = shuffle[n-i-1];
//
// distribute particles evenly to ensure proper spacing
//
p[i].x = size*(1.+(k%sx))/(1+sx);
p[i].y = size*(1.+(k/sx))/(1+sy);
p[i].index = i;
//
// assign random velocities within a bound
//
p[i].vx = drand48()*2-1;
p[i].vy = drand48()*2-1;
}
free( shuffle );
}
//
// interact two particles
//
void apply_force( particle_t &particle, particle_t &neighbor )
{
double dx = neighbor.x - particle.x;
double dy = neighbor.y - particle.y;
double r2 = dx * dx + dy * dy;
if( r2 > cutoff*cutoff )
return;
r2 = fmax( r2, min_r*min_r );
double r = sqrt( r2 );
//
// very simple short-range repulsive force
//
double coef = ( 1 - cutoff / r ) / r2 / mass;
particle.ax += coef * dx;
particle.ay += coef * dy;
}
//
// integrate the ODE
//
void move( particle_t &p )
{
//
// slightly simplified Velocity Verlet integration
// conserves energy better than explicit Euler method
//
p.vx += p.ax * dt;
p.vy += p.ay * dt;
p.x += p.vx * dt;
p.y += p.vy * dt;
//
// bounce from walls
//
while( p.x < 0 || p.x > size )
{
p.x = p.x < 0 ? -p.x : 2*size-p.x;
p.vx = -p.vx;
}
while( p.y < 0 || p.y > size )
{
p.y = p.y < 0 ? -p.y : 2*size-p.y;
p.vy = -p.vy;
}
}
//
// I/O routines
//
void save( FILE *f, int n, particle_t *p )
{
static bool first = true;
if( first )
{
fprintf( f, "%d %g\n", n, size );
first = false;
}
for( int i = 0; i < n; i++ )
fprintf( f, "%g %g\n", p[i].x, p[i].y );
}
//
// command line option processing
//
int find_option( int argc, char **argv, const char *option )
{
for( int i = 1; i < argc; i++ )
if( strcmp( argv[i], option ) == 0 )
return i;
return -1;
}
int read_int( int argc, char **argv, const char *option, int default_value )
{
int iplace = find_option( argc, argv, option );
if( iplace >= 0 && iplace < argc-1 )
return atoi( argv[iplace+1] );
return default_value;
}
char *read_string( int argc, char **argv, const char *option, char *default_value )
{
int iplace = find_option( argc, argv, option );
if( iplace >= 0 && iplace < argc-1 )
return argv[iplace+1];
return default_value;
}
#ifndef __CS267_COMMON_H__
#define __CS267_COMMON_H__
inline int min( int a, int b ) { return a < b ? a : b; }
inline int max( int a, int b ) { return a > b ? a : b; }
//
// saving parameters
//
const int NSTEPS = 1000;
const int SAVEFREQ = 10;
const int BOTTOM_STRAIGHT = 0,
BOTTOM_LEFT = 1,
STRAIGHT_LEFT = 2,
TOP_LEFT = 3,
TOP_STRAIGHT = 4,
TOP_RIGHT = 5,
STRAIGHT_RIGHT = 6,
BOTTOM_RIGHT = 7;
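//
// Neighbor layout around a block at coordinates (x, y), as computed in
// setup_neighbors(): TOP is y - 1, BOTTOM is y + 1, LEFT is x - 1, RIGHT is x + 1.
//
//   TOP_LEFT         TOP_STRAIGHT      TOP_RIGHT
//   STRAIGHT_LEFT       (self)         STRAIGHT_RIGHT
//   BOTTOM_LEFT      BOTTOM_STRAIGHT   BOTTOM_RIGHT
//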
//
// particle data structure
//
typedef struct
{
double index; // original particle id; stored as a double so particle_t maps onto 7 MPI_DOUBLEs
double x;
double y;
double vx;
double vy;
double ax;
double ay;
} particle_t;
typedef struct
{
int offset;
int send_size;
int recv_size;
int proc_x;
int proc_y;
int block_x;
int block_y;
int rank;
particle_t *send_buffer;
int buffer_size;
MPI_Request recv_req;
MPI_Request send_req;
MPI_Status recv_stat;
MPI_Status send_stat;
} neighbor_t;
typedef struct
{
double from_x;
double from_y;
double to_x;
double to_y;
neighbor_t *neighbors;
int offset;
int size;
int block_x;
int subrank;
int block_y;
} block_t;
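//
// Layout of a rank's particle array: each block's owned particles occupy
// [offset, offset + size), packed one block after another; the ghost particles
// received from each block's 8 neighbors follow, with each neighbor's region
// starting at neighbors[j].offset and holding neighbors[j].recv_size valid entries.
//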
//
// timing routines
//
double read_timer( );
int calc_array_size(int n, double size_x, double size_y, int split_factor, int n_proc);
void copy_to_ghost_buffers( particle_t *particles, block_t self);
void setup_ghost_buffers( int my_x, int my_y, neighbor_t *neighbors, double size_x, double size_y, int start_offset);
void setup_neighbors( block_t block, int my_x, int my_y, int proc_width, int proc_height, int split_factor );
//
// simulation routines
//
double set_size( int n );
void init_particles( int n, particle_t *p );
void apply_force( particle_t &particle, particle_t &neighbor );
void move( particle_t &p );
//
// I/O routines
//
FILE *open_save( char *filename, int n );
void save( FILE *f, int n, particle_t *p );
//
// argument processing routines
//
int find_option( int argc, char **argv, const char *option );
int read_int( int argc, char **argv, const char *option, int default_value );
char *read_string( int argc, char **argv, const char *option, char *default_value );
#endif
HOST = $(shell hostname)
BANG = $(shell expr match `hostname` ccom-bang)
BANG-COMPUTE = $(shell expr match `hostname` compute)
LILLIPUT = $(shell expr match `hostname` lilliput)
ifneq ($(BANG), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(BANG-COMPUTE), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(LILLIPUT), 0)
PUB = /class/public/cse260-fa12
include $(PUB)/Arch/arch.intel.generic
else
# PUB = /Users/baden/lib
include $(PUB)/Arch/arch.gnu
# include $(PUB)/Arch/arch.gnu-4.5
endif
endif
endif
#
# Add symbol table information for gdb/cachegrind
ifeq ($(debug), 1)
CFLAGS += -g
LDFLAGS += -g
C++FLAGS += -g
endif
# Add symbol table information for gprof
ifeq ($(gprof), 1)
CFLAGS += -g -pg
C++FLAGS += -g -pg
LDFLAGS += -g -pg
endif
# If you want to compile for single precision,
# specify single=1 on the "make" command line
ifeq ($(single), 1)
else
C++FLAGS += -D_DOUBLE
CFLAGS += -D_DOUBLE
endif
# If you want to compile so that you call the plotter for
# flattened 2D arrays (implemented as 1D arrays)
# specify flattened=1 on the "make" command line
ifeq ($(flattened), 1)
C++FLAGS += -DPLOT1D
CFLAGS += -DPLOT1D
endif
# If you want to use restrict pointers, make restrict=1
# This applies to the hand code version
ifeq ($(restrict), 1)
C++FLAGS += -D__RESTRICT
CFLAGS += -D__RESTRICT
ifneq ($(CARVER), 0)
C++FLAGS += -restrict
CFLAGS += -restrict
endif
endif
#DEBUG += -DDEBUG
TARGETS = mpi
app: $(TARGETS)
OBJECTS = common.o mpi.o
#ifeq ($(no-mpi),1)
#OBJECTS += Timer.o
#endif
mpi: mpi.o common.o
$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)
clean:
$(RM) *.o $(TARGETS);
$(RM) core.*;
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "common.h"
//
// benchmarking program
//
int main( int argc, char **argv )
{
//
// process command line parameters
//
if( find_option( argc, argv, "-h" ) >= 0 )
{
printf( "Options:\n" );
printf( "-h to see this help\n" );
printf( "-n <int> to set the number of particles\n" );
printf( "-proc_width <int> to set the width of the processor grid\n" );
printf( "-proc_height <int> to set the height of the processor grid\n" );
printf( "-split_factor <int> to set split factor of the processor grid\n" );
printf( "-o <filename> to specify the output file name\n" );
printf( "-c <checksum> to specify the output checksum file name\n" );
return 0;
}
int n = read_int( argc, argv, "-n", 1000 );
int proc_width = read_int( argc, argv, "-proc_width", 0 );
int proc_height = read_int( argc, argv, "-proc_height", 0 );
int split_factor = read_int( argc, argv, "-split_factor", 0 );
char *savename = read_string( argc, argv, "-o", NULL );
char *checksum = read_string( argc, argv, "-c", NULL );
//
// set up MPI
//
int n_proc, myrank;
MPI_Init( &argc, &argv );
MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
//Ensure that proc_width and proc_height match the number of processes.
if ((proc_height * proc_width) != n_proc){
if (!myrank)
printf("\n *** The number of processors in the geometry (%d) is not the same as the number requested (%d)\n",proc_height*proc_width,n_proc);
MPI_Abort(MPI_COMM_WORLD, -1);
}
//
// initialize sizes and ranks.
//
double size = set_size( n );
double size_x = (size / proc_width) / split_factor;
double size_y = (size / proc_height) / split_factor;
int my_x = myrank % proc_width;
int my_y = myrank / proc_width;
block_t *blocks = (block_t*) malloc(split_factor * split_factor * sizeof(block_t));
int block_no = (n_proc * split_factor * split_factor);
//
// allocate generic resources
//
FILE *fsave = savename && myrank == 0 ? fopen( savename, "w" ) : NULL;
FILE *fchecksum = checksum && myrank == 0 ? fopen( checksum, "w" ) : NULL;
int array_size = calc_array_size(n, size_x,size_y,split_factor,n_proc);
particle_t *particles = (particle_t*) malloc( array_size * sizeof(particle_t));
particle_t *particles_prev = (particle_t*) malloc( array_size * sizeof(particle_t));
MPI_Datatype PARTICLE;
MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE ); // one particle_t = 7 doubles (index, x, y, vx, vy, ax, ay)
MPI_Type_commit( &PARTICLE );
if (!myrank) { //Init all particles at rank 0
init_particles( n, particles_prev );
}
MPI_Bcast(particles_prev, n, PARTICLE, 0, MPI_COMM_WORLD);
//
// Set up the blocks and their neighbors
//
int ij;
for (int i = 0; i < split_factor; i++){
for (int j = 0; j < split_factor; j++){
ij = i*split_factor+j;
blocks[ij].block_x = j;
blocks[ij].block_y = i;
blocks[ij].from_x = (my_x + j * proc_width) * size_x;
blocks[ij].from_y = (my_y + i * proc_height) * size_y;
blocks[ij].to_x = blocks[ij].from_x + size_x;
blocks[ij].to_y = blocks[ij].from_y + size_y;
blocks[ij].subrank = ((j*n_proc) + (i*split_factor*n_proc) + myrank);
blocks[ij].offset = (ij == 0 ? 0 : blocks[ij - 1].size + blocks[ij - 1].offset);
blocks[ij].size = 0;
for(int k=0; k<n;k++){
if (particles_prev[k].x >= blocks[ij].from_x && particles_prev[k].x < blocks[ij].to_x &&
particles_prev[k].y >= blocks[ij].from_y && particles_prev[k].y < blocks[ij].to_y){
particles[blocks[ij].offset + blocks[ij].size] = particles_prev[k];
blocks[ij].size += 1;
}
}
blocks[ij].neighbors = (neighbor_t*) malloc( 8 * sizeof(neighbor_t) );
setup_neighbors( blocks[ij], my_x, my_y, proc_width, proc_height, split_factor );
}
}
int last_block_index = split_factor*split_factor-1;
int start_offset = blocks[last_block_index].offset + blocks[last_block_index].size;
for (int i = 0; i < split_factor*split_factor; i++){
setup_ghost_buffers(my_x,my_y, blocks[i].neighbors, size_x, size_y, start_offset);
start_offset = blocks[i].neighbors[7].offset + blocks[i].neighbors[7].buffer_size;
}
//Post non-blocking receives for the incoming ghost particles.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1){
MPI_Irecv(&particles[blocks[i].neighbors[j].offset],
blocks[i].neighbors[j].buffer_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].block_x + blocks[i].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].recv_req );
}
}
}
//Copy boundary particles into the ghost send buffers and send them.
for (int i = 0; i < split_factor * split_factor; i++){
copy_to_ghost_buffers( particles, blocks[i] );
for(int j = 0; j < 8; j++){
if (blocks[i].neighbors[j].rank != -1){
MPI_Isend(&blocks[i].neighbors[j].send_buffer[0],
blocks[i].neighbors[j].send_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].neighbors[j].block_x + blocks[i].neighbors[j].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].send_req );
}
}
}
//Wait for the receives and record how many ghost particles actually arrived.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].recv_req,&blocks[i].neighbors[j].recv_stat);
MPI_Get_count(&blocks[i].neighbors[j].recv_stat, PARTICLE, &blocks[i].neighbors[j].recv_size);
}
}
}
//Make sure all sends have completed before the buffers are reused.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].send_req,&blocks[i].neighbors[j].send_stat);
}
}
}
particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;
//Rank 0 gathers the per-rank particle counts to sanity-check the total.
int *n_proc_sizes = new int[n_proc];
int *n_proc_offset = new int[n_proc];
int count;
int size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
if(!myrank ) {
int real_n = 0;
for(int i = 0; i < n_proc; i++){
real_n += n_proc_sizes[i];
}
printf("Before correctness check: n: %d, real-n: %d\n",n, real_n);
}
//
// simulate a number of time steps
//
double simulation_time = read_timer( );
for( int step = 0; step < NSTEPS; step++ )
{
//
// save current step if necessary (slightly different semantics than in other codes)
//
if( (step%SAVEFREQ) == 0 && savename ) {
//Collect the per-rank particle counts.
size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
//Calculate offsets
if (myrank == 0){
for(int i = 0; i < n_proc; i++){
if (i == 0) {
n_proc_offset[i] = 0;
} else {
n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
}
}
}
MPI_Gatherv( &particles_prev[blocks[0].offset], size_of_array, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
if (myrank == 0 && fsave) {
save( fsave, n, particles );
}
}
//
// compute forces for our own particles and for the ghost particles from the neighbors
//
for( int i = 0; i < split_factor * split_factor; i++){
for(int k = blocks[i].offset; k < blocks[i].offset + blocks[i].size; k++){
particles_prev[k].ax = particles_prev[k].ay = 0;
for(int l = blocks[i].offset; l < blocks[i].offset + blocks[i].size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
for(int j = 0; j < 8; j++){
for(int l = blocks[i].neighbors[j].offset; l < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
}
}
for(int j = 0; j < 8; j++){
for(int k = blocks[i].neighbors[j].offset; k < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; k++ ) {
particles_prev[k].ax = particles_prev[k].ay = 0;
for(int l = blocks[i].offset; l < blocks[i].offset + blocks[i].size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
for(int z = 0; z < 8; z++){
for(int l = blocks[i].neighbors[z].offset; l < blocks[i].neighbors[z].offset + blocks[i].neighbors[z].recv_size; l++ ) {
apply_force( particles_prev[k], particles_prev[l] );
}
}
}
}
}
//
// move particles
//
int new_offset = blocks[0].offset;
for( int i = 0; i < split_factor * split_factor; i++){
count = 0;
int removed = 0;
int newin = 0;
for(int k = blocks[i].offset; k < blocks[i].offset + blocks[i].size; k++ ){
move(particles_prev[k]);
if (particles_prev[k].x >= blocks[i].from_x && particles_prev[k].x < blocks[i].to_x &&
particles_prev[k].y >= blocks[i].from_y && particles_prev[k].y < blocks[i].to_y){
particles[count+new_offset] = particles_prev[k];
count = count + 1;
} else {
removed = removed + 1;
}
}
for(int j = 0; j < 8; j++){
for(int k = blocks[i].neighbors[j].offset; k < blocks[i].neighbors[j].offset + blocks[i].neighbors[j].recv_size; k++ ) {
move(particles_prev[k]);
if (particles_prev[k].x >= blocks[i].from_x && particles_prev[k].x < blocks[i].to_x &&
particles_prev[k].y >= blocks[i].from_y && particles_prev[k].y < blocks[i].to_y){
particles[count + new_offset] = particles_prev[k];
count = count + 1;
newin = newin + 1;
}
}
blocks[i].neighbors[j].recv_size = 0;
}
blocks[i].size = count;
blocks[i].offset = new_offset;
new_offset = blocks[i].offset + blocks[i].size;
}
//Recompute the ghost-buffer offsets now that the owned particles have been repacked.
int start_offset = blocks[last_block_index].offset + blocks[last_block_index].size;
for (int i = 0; i < split_factor*split_factor; i++){
for(int j = 0; j < 8; j++) {
blocks[i].neighbors[j].offset = start_offset;
start_offset = start_offset + blocks[i].neighbors[j].buffer_size;
}
}
//Post non-blocking receives for the incoming ghost particles.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1){
MPI_Irecv(&particles[blocks[i].neighbors[j].offset],
blocks[i].neighbors[j].buffer_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].block_x + blocks[i].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].recv_req );
}
}
}
//Copy boundary particles into the ghost send buffers and send them.
for (int i = 0; i < split_factor * split_factor; i++){
copy_to_ghost_buffers( particles, blocks[i] );
for(int j = 0; j < 8; j++){
if (blocks[i].neighbors[j].rank != -1){
MPI_Isend(&blocks[i].neighbors[j].send_buffer[0],
blocks[i].neighbors[j].send_size,
PARTICLE,
blocks[i].neighbors[j].rank,
blocks[i].neighbors[j].block_x + blocks[i].neighbors[j].block_y*split_factor,
MPI_COMM_WORLD,
&blocks[i].neighbors[j].send_req );
}
}
}
//Wait for the receives and record how many ghost particles actually arrived.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].recv_req,&blocks[i].neighbors[j].recv_stat);
MPI_Get_count(&blocks[i].neighbors[j].recv_stat, PARTICLE, &blocks[i].neighbors[j].recv_size);
}
}
}
particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;
//Make sure all sends have completed before the buffers are reused.
for (int i = 0; i < split_factor * split_factor; i++){
for(int j = 0; j < 8; j++) {
if (blocks[i].neighbors[j].rank != -1) {
MPI_Wait(&blocks[i].neighbors[j].send_req,&blocks[i].neighbors[j].send_stat);
}
}
}
}
simulation_time = read_timer( ) - simulation_time;
size_of_array = blocks[last_block_index].size + blocks[last_block_index].offset;
MPI_Gather( &size_of_array, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
if(!myrank ) {
int real_n = 0;
for(int i = 0; i < n_proc; i++){
if (i == 0) {
n_proc_offset[i] = 0;
} else {
n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
}
real_n += n_proc_sizes[i];
}
printf("After correctness check: n: %d, real-n: %d\n",n, real_n);
printf( "n = %d, n_procs = %d, proc_width = %d, proc_height = %d, split_factor = %d, simulation time = %g s\n", n, n_proc, proc_width, proc_height, split_factor, simulation_time );
}
MPI_Gatherv( &particles_prev[blocks[0].offset], size_of_array, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
//Reorder particles by their original index before writing the checksum file.
if (!myrank && fchecksum){
for(int j = 0; j < n; j++){
particles_prev[(int)particles[j].index] = particles[j];
}
save( fchecksum, n, particles_prev );
}
//
// release resources
//
for(int i = 0; i < split_factor*split_factor; i++){
free(blocks[i].neighbors);
}
free( blocks );
free( particles );
free( particles_prev );
if( fsave )
fclose( fsave );
if (fchecksum)
fclose( fchecksum );
MPI_Finalize( );
return 0;
}
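The cyclic block distribution can be hard to picture from the setup loop alone. The standalone sketch below (not part of the gist's build; the grid dimensions simply mirror the -proc_width 4 -proc_height 2 -split_factor 2 example from the job script) prints which rank owns each block, using the same mapping as mpi.cpp: from_x = (my_x + j * proc_width) * size_x, so block column bx belongs to the rank whose grid x-coordinate is bx % proc_width, and likewise for rows.

#include <stdio.h>

int main(void)
{
    /* Example grid: matches the mpirun line in the job script. */
    const int proc_width = 4, proc_height = 2, split_factor = 2;

    /* The domain is cut into (proc_width * split_factor) block columns and
       (proc_height * split_factor) block rows. A block at (bx, by) is owned
       by the rank at grid coordinates (bx % proc_width, by % proc_height). */
    for (int by = 0; by < proc_height * split_factor; by++) {
        for (int bx = 0; bx < proc_width * split_factor; bx++) {
            int my_x = bx % proc_width;
            int my_y = by % proc_height;
            int rank = my_x + my_y * proc_width; /* same rank formula as setup_neighbors */
            printf("%2d ", rank);
        }
        printf("\n");
    }
    return 0;
}

Printing the grid makes the even, cyclic interleaving visible: each rank owns split_factor * split_factor blocks scattered across the domain rather than one contiguous patch.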