Created May 7, 2022 18:29
A Fortran example of a deadlock-avoiding MPI loop among all processes, using a single-line round-robin algorithm to schedule the order of communications for each process
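As an illustration, working the one-line formula myp = modulo(i-myid-1,nproc) out by hand for nproc = 4 gives the following pairing at each stage i of the loop:

i=1: ranks 1 and 3 exchange; ranks 0 and 2 handle their own value
i=2: ranks 0 and 1 exchange; ranks 2 and 3 exchange
i=3: ranks 0 and 2 exchange; ranks 1 and 3 handle their own value
i=4: ranks 0 and 3 exchange; ranks 1 and 2 exchange

so every pair of ranks meets exactly once and, at any given stage, each rank is involved in at most one exchange.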
program myalltoall
!Program to show a simple implementation of a deadlock-avoiding MPI loop among all processes which,
!in principle, is similar to an alltoall loop. However, the main purpose of the technique shown here is to
!properly reorder short, non-blocking communication loops (i.e., where each process exchanges data with just
!a few others), in order to alleviate the burden on the communication side (as each exchange is matched,
!everything is exchanged very quickly). Here, it is tested against the mpi_allreduce intrinsic with MPI_SUM
!on a single real variable, but IT IS NOT a replacement for allreduce (nor alltoall or any other intrinsic).
use, intrinsic :: iso_fortran_env, only : int32, real64
use mpi
implicit none

integer(int32), allocatable :: comm_order(:)
integer(int32) :: i, myp, nproc, myid, mpi_err, mpi_stat(mpi_status_size)
real(real64) :: x, xr1, xr2, xr, rp

!Initialize MPI
call mpi_init(mpi_err)
call mpi_comm_size(mpi_comm_world,nproc,mpi_err)
call mpi_comm_rank(mpi_comm_world,myid,mpi_err)

!The local variable that each process will send to any other process
call random_number(x)
!In order to test the loop below, as an example, we compute a reference allreduce sum of the local variables
call mpi_allreduce(x,xr1,1,mpi_double_precision,mpi_sum,mpi_comm_world,mpi_err)

!The key to avoiding a deadlock is to determine a proper communication order for each process. We use a
!round-robin algorithm. For certain purposes, one might want to store and preprocess the communication
!order before using it, which can be done as follows:
allocate(comm_order(nproc))
do i = 1, nproc
   comm_order(mod(myid+i-1,nproc)+1) = i
enddo
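!Note: after this loop, comm_order(s) holds (partner rank + 1) for stage s, so that comm_order(i)-1
!in the loop below recovers exactly the same partner as the one-line formula used there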
!The deadlock-avoiding loop with all the processes (including itself)
xr2 = 0.0_real64
do i = 1, nproc
   !Determine the communication partner at the i-th stage
   !myp = comm_order(i)-1 !Using the precomputed comm_order
   myp = modulo(i-myid-1,nproc) !Using the MAGIC ONE LINER
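   !Note: this pairing is symmetric: if myp = modulo(i-myid-1,nproc), then modulo(i-myp-1,nproc) = myid,
   !so at every stage the two partners select each other and each send is matched by a recv in the same stage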
   !The two partners still need to agree on who sends first: the one with the larger id sends first
   if (myid>myp) then
      !First send, then recv
      call mpi_send(x,1,mpi_double_precision,myp,myid,mpi_comm_world,mpi_err)
      call mpi_recv(xr,1,mpi_double_precision,myp,myp,mpi_comm_world,mpi_stat,mpi_err)
      xr2 = xr2 + xr !Just for the sake of the test, we keep a running sum of all the received values
   elseif (myid<myp) then
      !First recv, then send
      call mpi_recv(xr,1,mpi_double_precision,myp,myp,mpi_comm_world,mpi_stat,mpi_err)
      call mpi_send(x,1,mpi_double_precision,myp,myid,mpi_comm_world,mpi_err)
      xr2 = xr2 + xr !Just for the sake of the test, we keep a running sum of all the received values
   else
      !It is me!!
      xr2 = xr2 + x !Just for the sake of the test, we keep a running sum of all the received values
   endif
enddo
!Pick a random process to test the end result
if (myid==0) then
   call random_number(rp)
   myp = floor(rp*nproc)
endif
call mpi_bcast(myp,1,mpi_integer,0,mpi_comm_world,mpi_err)

!Check the end result against the MPI one on the randomly chosen process
if (myid==myp) write(*,'(a,g0,a,i0)') "DIFF: ", (xr1-xr2), " from proc ", myp

!Finalize MPI
call mpi_finalize(mpi_err)

endprogram myalltoall
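To try it (assuming the file is saved as myalltoall.f90 and that the usual MPI wrapper compiler and launcher are available; their names may differ between MPI distributions), something along these lines should work:

mpif90 myalltoall.f90 -o myalltoall
mpirun -np 8 ./myalltoall

The randomly chosen rank then prints the DIFF between the mpi_allreduce result and the manually accumulated sum; it should be zero up to floating-point round-off, since the two sums may accumulate the contributions in a different order.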