jstroem · December 3, 2012 05:15
diff --git a/batch_bang.sh b/batch_bang.sh
 #!/bin/bash
 # This script is intepreted by the Bourne Shell, sh
 #
 # Documentation for SGE is found in:
 # http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
 #
 # Tell SGE which shell to run the job script in rather than depending
 # on SGE to try and figure it out.
 #$ -S /bin/bash
 #
 # Export all my environment variables to the job
 #$ -V
 #
 # Tun the job in the same directory from which you submitted it
 #$ -cwd
 #
 #
 # --- Don't change anything above this line ---
 #
 # Give a name to the job
 #$ -N MPI
 #
 # Specify a time limit for the job, not more than 30 minutes
 #$ -l h_rt=00:02:00
 #
 # Specify the parallel environment and number of cores,
 # If not a multiple of 8, you'll get the whole node anyway
 #$ -pe orte 8
 #
 # Join stdout and stderr so they are reported in job output file
 #$ -j y
 #
 #
 # Choose the queue to run the job
 #
 # Debug queue: only one node may be used at a time for up to 30 minutes
 # Interactive or batch jobs, maximum of 1 job per user running at a time
 #
 # Normal queue: job may use all available compute nodes (256 cores)
 # for up to 60 minutes
 # Batch jobs, maximum of 2 jobs per user running at a time
 # To use more than one node, specify the "normal" queue
 #$ -q normal.q
 # #$ -q debug.q
 #
 # Specifies the circumstances under which mail is to be sent to the job owner
 # defined by -M option. For example, options "bea" cause mail to be sent at the 
 # begining, end, and at abort time (if it happens) of the job.
 # Option "n" means no mail will be sent.
 #$ -m aeb
 #
 # *** Change to the address you want the notification sent to, and
 # *** REMOVE the blank between the # and the $
 #$ -M jesper.jln@gmail.com
 #


 echo
 echo " *** Current working directory"
 pwd
 echo
 echo " *** Compiler"
 # Output which  compiler are we using and the environment
 mpicc -v
 echo
 echo " *** Environment"
 printenv

 echo

 echo ">>> Job Starts"
 date
 mpirun -np $NSLOTS ./mpi -n 500 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt

 date
 echo ">>> Job Ends"
diff --git a/common.cpp b/common.cpp
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 #include <float.h>
 #include <string.h>
 #include <math.h>
 #include <time.h>
 #include <sys/time.h>
 #include "common.h"

 double size;

 //
 //  tuned constants
 //
 #define density 0.0005
 #define mass    0.01
 #define cutoff  0.01
 #define min_r   (cutoff/100)
 #define dt      0.0005

 //
 //  timer
 //
 double read_timer( )
 {
    static bool initialized = false;
    static struct timeval start;
    struct timeval end;
    if( !initialized )
    {
        gettimeofday( &start, NULL );
        initialized = true;
    }
    gettimeofday( &end, NULL );
    return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
 }

 //
 //  keep density constant
 //
 void set_size( int n )
 {
    size = sqrt( density * n );
 }

 //
 //  Initialize the particle positions and velocities
 //
 void init_particles( int n, particle_t *p )
 {
    srand48( 1 );
        
    int sx = (int)ceil(sqrt((double)n));
    int sy = (n+sx-1)/sx;
    
    int *shuffle = (int*)malloc( n * sizeof(int) );
    for( int i = 0; i < n; i++ )
        shuffle[i] = i;
    
    for( int i = 0; i < n; i++ ) 
    {
        //
        //  make sure particles are not spatially sorted
        //
        int j = lrand48()%(n-i);
        int k = shuffle[j];
        shuffle[j] = shuffle[n-i-1];
        
        //
        //  distribute particles evenly to ensure proper spacing
        //
        p[i].x = size*(1.+(k%sx))/(1+sx);
        p[i].y = size*(1.+(k/sx))/(1+sy);
        p[i].index = i;

        //
        //  assign random velocities within a bound
        //
        p[i].vx = drand48()*2-1;
        p[i].vy = drand48()*2-1;
    }
    free( shuffle );
 }

 //
 //  interact two particles
 //
 void apply_force( particle_t &particle, particle_t &neighbor )
 {

    double dx = neighbor.x - particle.x;
    double dy = neighbor.y - particle.y;
    double r2 = dx * dx + dy * dy;
    if( r2 > cutoff*cutoff )
        return;
    r2 = fmax( r2, min_r*min_r );
    double r = sqrt( r2 );

    //
    //  very simple short-range repulsive force
    //
    double coef = ( 1 - cutoff / r ) / r2 / mass;
    particle.ax += coef * dx;
    particle.ay += coef * dy;
 }

 //
 //  integrate the ODE
 //
 void move( particle_t &p )
 {
    //
    //  slightly simplified Velocity Verlet integration
    //  conserves energy better than explicit Euler method
    //
    p.vx += p.ax * dt;
    p.vy += p.ay * dt;
    p.x  += p.vx * dt;
    p.y  += p.vy * dt;

    //
    //  bounce from walls
    //
    while( p.x < 0 || p.x > size )
    {
        p.x  = p.x < 0 ? -p.x : 2*size-p.x;
        p.vx = -p.vx;
    }
    while( p.y < 0 || p.y > size )
    {
        p.y  = p.y < 0 ? -p.y : 2*size-p.y;
        p.vy = -p.vy;
    }
 }

 //
 //  I/O routines
 //
 void save( FILE *f, int n, particle_t *p )
 {
    static bool first = true;
    if( first )
    {
        fprintf( f, "%d %g\n", n, size );
        first = false;
    }
    for( int i = 0; i < n; i++ )
        fprintf( f, "%g %g\n", p[i].x, p[i].y );
 }

 //
 //  command line option processing
 //
 int find_option( int argc, char **argv, const char *option )
 {
    for( int i = 1; i < argc; i++ )
        if( strcmp( argv[i], option ) == 0 )
            return i;
    return -1;
 }

 int read_int( int argc, char **argv, const char *option, int default_value )
 {
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return atoi( argv[iplace+1] );
    return default_value;
 }

 char *read_string( int argc, char **argv, const char *option, char *default_value )
 {
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return argv[iplace+1];
    return default_value;
 }
diff --git a/common.h b/common.h
 #ifndef __CS267_COMMON_H__
 #define __CS267_COMMON_H__

 inline int min( int a, int b ) { return a < b ? a : b; }
 inline int max( int a, int b ) { return a > b ? a : b; }

 //
 //  saving parameters
 //
 const int NSTEPS = 1000;
 const int SAVEFREQ = 10;

 //
 // particle data structure
 //
 typedef struct 
 {
  double index;
  double x;
  double y;
  double vx;
  double vy;
  double ax;
  double ay;
 } particle_t;

 //
 //  timing routines
 //
 double read_timer( );

 //
 //  simulation routines
 //
 void set_size( int n );
 void init_particles( int n, particle_t *p );
 void apply_force( particle_t &particle, particle_t &neighbor );
 void move( particle_t &p );

 //
 //  I/O routines
 //
 FILE *open_save( char *filename, int n );
 void save( FILE *f, int n, particle_t *p );

 //
 //  argument processing routines
 //
 int find_option( int argc, char **argv, const char *option );
 int read_int( int argc, char **argv, const char *option, int default_value );
 char *read_string( int argc, char **argv, const char *option, char *default_value );

 #endif
diff --git a/Makefile b/Makefile
 HOST = $(shell hostname)
 BANG   =  $(shell expr match `hostname` ccom-bang)
 BANG-COMPUTE   =  $(shell expr match `hostname` compute)
 LILLIPUT   =  $(shell expr match `hostname` lilliput)



 ifneq ($(BANG), 0)
 PUB     = /share/class/public/cse260-fa12
 include $(PUB)/Arch/arch.gnu.generic
 else
 ifneq ($(BANG-COMPUTE), 0)
 PUB     = /share/class/public/cse260-fa12
 include $(PUB)/Arch/arch.gnu.generic
 else
 ifneq ($(LILLIPUT), 0)
 PUB	= /class/public/cse260-fa12
 include $(PUB)/Arch/arch.intel.generic
 else
 # PUB = /Users/baden/lib
 include $(PUB)/Arch/arch.gnu
 # include $(PUB)/Arch/arch.gnu-4.5
 endif
 endif
 endif
 #
 # Add symbol table information for gdb/cachegrind
 ifeq ($(debug), 1)
        CFLAGS += -g
        LDFLAGS += -g
        C++FLAGS += -g
 endif   


 # Add symbol table information for gprof
 ifeq ($(gprof), 1)
        CFLAGS += -g -pg
        C++FLAGS += -g -pg
        LDFLAGS += -g -pg
 endif

 # If you want to compile for single precision,
 # specify single=1 on the "make" command line
 ifeq ($(single), 1)
 else
    C++FLAGS += -D_DOUBLE
    CFLAGS += -D_DOUBLE
 endif


 # If you want to compile so that you call the plotter for
 # flattened 2D arrays (implemented as 1D arrays)
 # specify flattened=1 on the "make" command line
 ifeq ($(flattened), 1)
    C++FLAGS += -DPLOT1D
    CFLAGS += -DPLOT1D
 endif

 # If you want to use restrict pointers, make restrict=1
 # This applies to the hand code version
 ifeq ($(restrict), 1)
    C++FLAGS += -D__RESTRICT
    CFLAGS += -D__RESTRICT
 ifneq ($(CARVER), 0)
    C++FLAGS += -restrict
    CFLAGS += -restrict
 endif
 endif


 #DEBUG += -DDEBUG
 TARGETS = mpi

 app:		$(TARGETS)

 OBJECTS = common.o mpi.o.o
 #ifeq ($(no-mpi),1)
 #OBJECTS += Timer.o
 #endif
 app:	$(TARGETS)

 mpi: mpi.o common.o
 	$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)

 clean:	
 	$(RM) *.o $(TARGETS);
 	$(RM) core;
diff --git a/mpi.cpp b/mpi.cpp
 #include <mpi.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 #include "common.h"

 //
 //  benchmarking program
 //
 int main( int argc, char **argv )
 {    
    //
    //  process command line parameters
    //
    if( find_option( argc, argv, "-h" ) >= 0 )
    {
        printf( "Options:\n" );
        printf( "-h to see this help\n" );
        printf( "-n <int> to set the number of particles\n" );
        printf( "-o <filename> to specify the output file name\n" );
        printf( "-c <checksum> to specify the output checksum file name\n" );
        return 0;
    }
    
    int n = read_int( argc, argv, "-n", 1000 );
    char *savename = read_string( argc, argv, "-o", NULL );
    char *checksum = read_string( argc, argv, "-c", NULL );
    
    //
    //  set up MPI
    //
    int n_proc, rank;
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    
    //
    //  allocate generic resources
    //
    FILE *fsave = savename && rank == 0 ? fopen( savename, "w" ) : NULL;
    FILE *fchecksum = checksum && rank == 0 ? fopen( checksum, "w" ) : NULL;
    particle_t *particles = (particle_t*) malloc( n * sizeof(particle_t) );
    
    MPI_Datatype PARTICLE;
    MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE );
    MPI_Type_commit( &PARTICLE );
    
    //
    //  set up the data partitioning across processors
    //
    int particle_per_proc = (n + n_proc - 1) / n_proc;
    int *partition_offsets = (int*) malloc( (n_proc+1) * sizeof(int) );
    for( int i = 0; i < n_proc+1; i++ )
        partition_offsets[i] = min( i * particle_per_proc, n );
    
    int *partition_sizes = (int*) malloc( n_proc * sizeof(int) );
    for( int i = 0; i < n_proc; i++ )
        partition_sizes[i] = partition_offsets[i+1] - partition_offsets[i];
    
    //
    //  allocate storage for local partition
    //
    int nlocal = partition_sizes[rank];
    particle_t *local = (particle_t*) malloc( nlocal * sizeof(particle_t) );
    
    //
    //  initialize and distribute the particles (that's fine to leave it unoptimized)
    //
    set_size( n );
    if( rank == 0 )
        init_particles( n, particles );
    MPI_Scatterv( particles, partition_sizes, partition_offsets, PARTICLE, local, nlocal, PARTICLE, 0, MPI_COMM_WORLD );
    
    //
    //  simulate a number of time steps
    //
    double simulation_time = read_timer( );
    for( int step = 0; step < NSTEPS; step++ )
    {
        // 
        //  collect all global data locally (not good idea to do)
        //
        MPI_Allgatherv( local, nlocal, PARTICLE, particles, partition_sizes, partition_offsets, PARTICLE, MPI_COMM_WORLD );
        
        //
        //  save current step if necessary (slightly different semantics than in other codes)
        //
        if( fsave && (step%SAVEFREQ) == 0 )
            save( fsave, n, particles );
        
        //
        //  compute all forces
        //
        for( int i = 0; i < nlocal; i++ )
        {
            local[i].ax = local[i].ay = 0;
            for (int j = 0; j < n; j++ )
                apply_force( local[i], particles[j] );
        }
        
        //
        //  move particles
        //
        for( int i = 0; i < nlocal; i++ )
            move( local[i] );
    }
    simulation_time = read_timer( ) - simulation_time;
    
    if( rank == 0 )
        printf( "n = %d, n_procs = %d, simulation time = %g s\n", n, n_proc, simulation_time );

    MPI_Allgatherv( local, nlocal, PARTICLE, particles, partition_sizes, partition_offsets, PARTICLE, MPI_COMM_WORLD );
    if (!rank && fchecksum){
        particle_t *particles_prev = (particle_t*) malloc( n * sizeof(particle_t) );
        for(int j = 0; j < n; j++){
            particles_prev[(int)particles[j].index] = particles[j];
        }
        save( fchecksum, n, particles_prev );
    }

    
    //
    //  release resources
    //
    free( partition_offsets );
    free( partition_sizes );
    free( local );
    free( particles );
    if( fsave )
        fclose( fsave );
    if (fchecksum)
        fclose( fchecksum );
    
    MPI_Finalize( );
    
    return 0;
 }
	#!/bin/bash
	# This script is intepreted by the Bourne Shell, sh
	#
	# Documentation for SGE is found in:
	# http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
	#
	# Tell SGE which shell to run the job script in rather than depending
	# on SGE to try and figure it out.
	#$ -S /bin/bash
	#
	# Export all my environment variables to the job
	#$ -V
	#
	# Tun the job in the same directory from which you submitted it
	#$ -cwd
	#
	#
	# --- Don't change anything above this line ---
	#
	# Give a name to the job
	#$ -N MPI
	#
	# Specify a time limit for the job, not more than 30 minutes
	#$ -l h_rt=00:02:00
	#
	# Specify the parallel environment and number of cores,
	# If not a multiple of 8, you'll get the whole node anyway
	#$ -pe orte 8
	#
	# Join stdout and stderr so they are reported in job output file
	#$ -j y
	#
	#
	# Choose the queue to run the job
	#
	# Debug queue: only one node may be used at a time for up to 30 minutes
	# Interactive or batch jobs, maximum of 1 job per user running at a time
	#
	# Normal queue: job may use all available compute nodes (256 cores)
	# for up to 60 minutes
	# Batch jobs, maximum of 2 jobs per user running at a time
	# To use more than one node, specify the "normal" queue
	#$ -q normal.q
	# #$ -q debug.q
	#
	# Specifies the circumstances under which mail is to be sent to the job owner
	# defined by -M option. For example, options "bea" cause mail to be sent at the
	# begining, end, and at abort time (if it happens) of the job.
	# Option "n" means no mail will be sent.
	#$ -m aeb
	#
	# *** Change to the address you want the notification sent to, and
	# *** REMOVE the blank between the # and the $
	#$ -M jesper.jln@gmail.com
	#


	echo
	echo " *** Current working directory"
	pwd
	echo
	echo " *** Compiler"
	# Output which compiler are we using and the environment
	mpicc -v
	echo
	echo " *** Environment"
	printenv

	echo

	echo ">>> Job Starts"
	date
	mpirun -np $NSLOTS ./mpi -n 500 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt

	date
	echo ">>> Job Ends"
	#include <stdlib.h>
	#include <stdio.h>
	#include <assert.h>
	#include <float.h>
	#include <string.h>
	#include <math.h>
	#include <time.h>
	#include <sys/time.h>
	#include "common.h"

	double size;

	//
	// tuned constants
	//
	#define density 0.0005
	#define mass 0.01
	#define cutoff 0.01
	#define min_r (cutoff/100)
	#define dt 0.0005

	//
	// timer
	//
	double read_timer( )
	{
	static bool initialized = false;
	static struct timeval start;
	struct timeval end;
	if( !initialized )
	{
	gettimeofday( &start, NULL );
	initialized = true;
	}
	gettimeofday( &end, NULL );
	return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
	}

	//
	// keep density constant
	//
	void set_size( int n )
	{
	size = sqrt( density * n );
	}

	//
	// Initialize the particle positions and velocities
	//
	void init_particles( int n, particle_t *p )
	{
	srand48( 1 );

	int sx = (int)ceil(sqrt((double)n));
	int sy = (n+sx-1)/sx;

	int shuffle = (int)malloc( n * sizeof(int) );
	for( int i = 0; i < n; i++ )
	shuffle[i] = i;

	for( int i = 0; i < n; i++ )
	{
	//
	// make sure particles are not spatially sorted
	//
	int j = lrand48()%(n-i);
	int k = shuffle[j];
	shuffle[j] = shuffle[n-i-1];

	//
	// distribute particles evenly to ensure proper spacing
	//
	p[i].x = size*(1.+(k%sx))/(1+sx);
	p[i].y = size*(1.+(k/sx))/(1+sy);
	p[i].index = i;

	//
	// assign random velocities within a bound
	//
	p[i].vx = drand48()*2-1;
	p[i].vy = drand48()*2-1;
	}
	free( shuffle );
	}

	//
	// interact two particles
	//
	void apply_force( particle_t &particle, particle_t &neighbor )
	{

	double dx = neighbor.x - particle.x;
	double dy = neighbor.y - particle.y;
	double r2 = dx * dx + dy * dy;
	if( r2 > cutoff*cutoff )
	return;
	r2 = fmax( r2, min_r*min_r );
	double r = sqrt( r2 );

	//
	// very simple short-range repulsive force
	//
	double coef = ( 1 - cutoff / r ) / r2 / mass;
	particle.ax += coef * dx;
	particle.ay += coef * dy;
	}

	//
	// integrate the ODE
	//
	void move( particle_t &p )
	{
	//
	// slightly simplified Velocity Verlet integration
	// conserves energy better than explicit Euler method
	//
	p.vx += p.ax * dt;
	p.vy += p.ay * dt;
	p.x += p.vx * dt;
	p.y += p.vy * dt;

	//
	// bounce from walls
	//
	while( p.x < 0 \|\| p.x > size )
	{
	p.x = p.x < 0 ? -p.x : 2*size-p.x;
	p.vx = -p.vx;
	}
	while( p.y < 0 \|\| p.y > size )
	{
	p.y = p.y < 0 ? -p.y : 2*size-p.y;
	p.vy = -p.vy;
	}
	}

	//
	// I/O routines
	//
	void save( FILE f, int n, particle_t p )
	{
	static bool first = true;
	if( first )
	{
	fprintf( f, "%d %g\n", n, size );
	first = false;
	}
	for( int i = 0; i < n; i++ )
	fprintf( f, "%g %g\n", p[i].x, p[i].y );
	}

	//
	// command line option processing
	//
	int find_option( int argc, char *argv, const char option )
	{
	for( int i = 1; i < argc; i++ )
	if( strcmp( argv[i], option ) == 0 )
	return i;
	return -1;
	}

	int read_int( int argc, char *argv, const char option, int default_value )
	{
	int iplace = find_option( argc, argv, option );
	if( iplace >= 0 && iplace < argc-1 )
	return atoi( argv[iplace+1] );
	return default_value;
	}

	char read_string( int argc, char argv, const char option, char *default_value )
	{
	int iplace = find_option( argc, argv, option );
	if( iplace >= 0 && iplace < argc-1 )
	return argv[iplace+1];
	return default_value;
	}
	#ifndef __CS267_COMMON_H__
	#define __CS267_COMMON_H__

	inline int min( int a, int b ) { return a < b ? a : b; }
	inline int max( int a, int b ) { return a > b ? a : b; }

	//
	// saving parameters
	//
	const int NSTEPS = 1000;
	const int SAVEFREQ = 10;

	//
	// particle data structure
	//
	typedef struct
	{
	double index;
	double x;
	double y;
	double vx;
	double vy;
	double ax;
	double ay;
	} particle_t;

	//
	// timing routines
	//
	double read_timer( );

	//
	// simulation routines
	//
	void set_size( int n );
	void init_particles( int n, particle_t *p );
	void apply_force( particle_t &particle, particle_t &neighbor );
	void move( particle_t &p );

	//
	// I/O routines
	//
	FILE open_save( char filename, int n );
	void save( FILE f, int n, particle_t p );

	//
	// argument processing routines
	//
	int find_option( int argc, char *argv, const char option );
	int read_int( int argc, char *argv, const char option, int default_value );
	char read_string( int argc, char argv, const char option, char *default_value );

	#endif
	HOST = $(shell hostname)
	BANG = $(shell expr match `hostname` ccom-bang)
	BANG-COMPUTE = $(shell expr match `hostname` compute)
	LILLIPUT = $(shell expr match `hostname` lilliput)



	ifneq ($(BANG), 0)
	PUB = /share/class/public/cse260-fa12
	include $(PUB)/Arch/arch.gnu.generic
	else
	ifneq ($(BANG-COMPUTE), 0)
	PUB = /share/class/public/cse260-fa12
	include $(PUB)/Arch/arch.gnu.generic
	else
	ifneq ($(LILLIPUT), 0)
	PUB = /class/public/cse260-fa12
	include $(PUB)/Arch/arch.intel.generic
	else
	# PUB = /Users/baden/lib
	include $(PUB)/Arch/arch.gnu
	# include $(PUB)/Arch/arch.gnu-4.5
	endif
	endif
	endif
	#
	# Add symbol table information for gdb/cachegrind
	ifeq ($(debug), 1)
	CFLAGS += -g
	LDFLAGS += -g
	C++FLAGS += -g
	endif


	# Add symbol table information for gprof
	ifeq ($(gprof), 1)
	CFLAGS += -g -pg
	C++FLAGS += -g -pg
	LDFLAGS += -g -pg
	endif

	# If you want to compile for single precision,
	# specify single=1 on the "make" command line
	ifeq ($(single), 1)
	else
	C++FLAGS += -D_DOUBLE
	CFLAGS += -D_DOUBLE
	endif


	# If you want to compile so that you call the plotter for
	# flattened 2D arrays (implemented as 1D arrays)
	# specify flattened=1 on the "make" command line
	ifeq ($(flattened), 1)
	C++FLAGS += -DPLOT1D
	CFLAGS += -DPLOT1D
	endif

	# If you want to use restrict pointers, make restrict=1
	# This applies to the hand code version
	ifeq ($(restrict), 1)
	C++FLAGS += -D__RESTRICT
	CFLAGS += -D__RESTRICT
	ifneq ($(CARVER), 0)
	C++FLAGS += -restrict
	CFLAGS += -restrict
	endif
	endif


	#DEBUG += -DDEBUG
	TARGETS = mpi

	app: $(TARGETS)

	OBJECTS = common.o mpi.o.o
	#ifeq ($(no-mpi),1)
	#OBJECTS += Timer.o
	#endif
	app: $(TARGETS)

	mpi: mpi.o common.o
	$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)

	clean:
	$(RM) *.o $(TARGETS);
	$(RM) core;
	#include <mpi.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <assert.h>
	#include "common.h"

	//
	// benchmarking program
	//
	int main( int argc, char **argv )
	{
	//
	// process command line parameters
	//
	if( find_option( argc, argv, "-h" ) >= 0 )
	{
	printf( "Options:\n" );
	printf( "-h to see this help\n" );
	printf( "-n <int> to set the number of particles\n" );
	printf( "-o <filename> to specify the output file name\n" );
	printf( "-c <checksum> to specify the output checksum file name\n" );
	return 0;
	}

	int n = read_int( argc, argv, "-n", 1000 );
	char *savename = read_string( argc, argv, "-o", NULL );
	char *checksum = read_string( argc, argv, "-c", NULL );

	//
	// set up MPI
	//
	int n_proc, rank;
	MPI_Init( &argc, &argv );
	MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
	MPI_Comm_rank( MPI_COMM_WORLD, &rank );

	//
	// allocate generic resources
	//
	FILE *fsave = savename && rank == 0 ? fopen( savename, "w" ) : NULL;
	FILE *fchecksum = checksum && rank == 0 ? fopen( checksum, "w" ) : NULL;
	particle_t particles = (particle_t) malloc( n * sizeof(particle_t) );

	MPI_Datatype PARTICLE;
	MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE );
	MPI_Type_commit( &PARTICLE );

	//
	// set up the data partitioning across processors
	//
	int particle_per_proc = (n + n_proc - 1) / n_proc;
	int partition_offsets = (int) malloc( (n_proc+1) * sizeof(int) );
	for( int i = 0; i < n_proc+1; i++ )
	partition_offsets[i] = min( i * particle_per_proc, n );

	int partition_sizes = (int) malloc( n_proc * sizeof(int) );
	for( int i = 0; i < n_proc; i++ )
	partition_sizes[i] = partition_offsets[i+1] - partition_offsets[i];

	//
	// allocate storage for local partition
	//
	int nlocal = partition_sizes[rank];
	particle_t local = (particle_t) malloc( nlocal * sizeof(particle_t) );

	//
	// initialize and distribute the particles (that's fine to leave it unoptimized)
	//
	set_size( n );
	if( rank == 0 )
	init_particles( n, particles );
	MPI_Scatterv( particles, partition_sizes, partition_offsets, PARTICLE, local, nlocal, PARTICLE, 0, MPI_COMM_WORLD );

	//
	// simulate a number of time steps
	//
	double simulation_time = read_timer( );
	for( int step = 0; step < NSTEPS; step++ )
	{
	//
	// collect all global data locally (not good idea to do)
	//
	MPI_Allgatherv( local, nlocal, PARTICLE, particles, partition_sizes, partition_offsets, PARTICLE, MPI_COMM_WORLD );

	//
	// save current step if necessary (slightly different semantics than in other codes)
	//
	if( fsave && (step%SAVEFREQ) == 0 )
	save( fsave, n, particles );

	//
	// compute all forces
	//
	for( int i = 0; i < nlocal; i++ )
	{
	local[i].ax = local[i].ay = 0;
	for (int j = 0; j < n; j++ )
	apply_force( local[i], particles[j] );
	}

	//
	// move particles
	//
	for( int i = 0; i < nlocal; i++ )
	move( local[i] );
	}
	simulation_time = read_timer( ) - simulation_time;

	if( rank == 0 )
	printf( "n = %d, n_procs = %d, simulation time = %g s\n", n, n_proc, simulation_time );

	MPI_Allgatherv( local, nlocal, PARTICLE, particles, partition_sizes, partition_offsets, PARTICLE, MPI_COMM_WORLD );
	if (!rank && fchecksum){
	particle_t particles_prev = (particle_t) malloc( n * sizeof(particle_t) );
	for(int j = 0; j < n; j++){
	particles_prev[(int)particles[j].index] = particles[j];
	}
	save( fchecksum, n, particles_prev );
	}


	//
	// release resources
	//
	free( partition_offsets );
	free( partition_sizes );
	free( local );
	free( particles );
	if( fsave )
	fclose( fsave );
	if (fchecksum)
	fclose( fchecksum );

	MPI_Finalize( );

	return 0;
	}