Nonblocking Halo Exchange Example
Our first application of nonblocking point-to-point communication is to the familiar halo exchange problem. We can modify the solutions to the Exercise.
The Sendrecv invocations convert readily to two Irecv and two Isend calls. With nonblocking sends and receives, we do not have to manage the “sweeps” up and down, or left to right, ourselves; the MPI library handles the ordering. This avoids the serialization of the blocking exchanges, which was the main motivation for introducing nonblocking communication.
As a general rule, the Irecv calls should be posted first, followed by the Isend calls. In cases such as this, it is also preferable to use Waitall so that the MPI library can manage the ordering of the completions.
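Before examining the full programs, it may help to see the bare pattern in isolation. The sketch below is a minimal, self-contained illustration rather than the example code itself: the buffer names (first_row, last_row, ghost_top, ghost_bottom) and the row length NC are placeholders chosen for this sketch, while the up/down neighbor logic and the Irecv, Isend, and Waitall sequence mirror the listings that follow.

// Minimal sketch of the nonblocking halo-exchange pattern.
// Placeholder buffers; the full examples below use the real work array.
#include <mpi.h>

int main(int argc, char *argv[]) {
    const int NC = 8;                        // illustrative row length
    double first_row[NC], last_row[NC];      // rows we own and send
    double ghost_top[NC], ghost_bottom[NC];  // halo rows we receive
    int tag = 0;

    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    for (int j = 0; j < NC; ++j) { first_row[j] = rank; last_row[j] = rank; }

    // Neighbors; MPI_PROC_NULL makes the end ranks' calls no-ops
    int up   = (rank == 0)        ? MPI_PROC_NULL : rank - 1;
    int down = (rank == nprocs-1) ? MPI_PROC_NULL : rank + 1;

    MPI_Request requests[4];

    // Post the receives into the ghost rows first
    MPI_Irecv(ghost_bottom, NC, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[0]);
    MPI_Irecv(ghost_top,    NC, MPI_DOUBLE, up,   tag, MPI_COMM_WORLD, &requests[1]);

    // Then the sends
    MPI_Isend(first_row, NC, MPI_DOUBLE, up,   tag, MPI_COMM_WORLD, &requests[2]);
    MPI_Isend(last_row,  NC, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[3]);

    // Let the library complete all four operations in whatever order it chooses
    MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);

    MPI_Finalize();
    return 0;
}

Because missing neighbors are set to MPI_PROC_NULL, the same four calls can be issued unconditionally on every rank; communication with MPI_PROC_NULL completes immediately.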
#include <iostream>
#include <cstring>
#include <mpi.h>

using namespace std;

int main (int argc, char *argv[]) {

    int i, j;
    int N;

    // Added for MPI
    int nr, nrl, nc, ncl;
    int rank, nprocs;
    MPI_Status status;
    int root=0, tag=0;
    int up, down;

    //Initialize MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    //Usually we would read in nr and nc; they represent the global grid RxC.
    //For this case they are both N.
    N=12;
    nr=N;
    nc=N;

    //Use ncl for clarity
    ncl=nc;

    //Weak scaling
    //nrl=nr;

    //Strong scaling
    if (nr%nprocs!=0) {
        MPI_Finalize();
        cout<<"Not an even division of the arrays.\n";
        return 0;
    }
    else {
        nrl=nr/nprocs;
    }

    //Find my neighbors
    if (rank==0) {
        up=MPI_PROC_NULL;
    }
    else {
        up=rank-1;
    }
    if (rank==nprocs-1) {
        down=MPI_PROC_NULL;
    }
    else {
        down=rank+1;
    }

    //Two extra rows and columns for boundary conditions
    int nrows=nrl+2;
    int ncols=ncl+2;

    //Allocate the rows contiguously so a whole row can be passed to MPI
    double **w=new double*[nrows];
    double *wptr=new double[nrows*ncols];
    for (i=0; i<nrows; ++i, wptr+=ncols)
        w[i] = wptr;
    //Initialize
    for (int i=0; i<=nrl+1; ++i) {
        for (int j=0; j<=ncl+1; ++j) {
            w[i][j]=50.;
        }
    }

    //Rank-specific values in the rows that will be exchanged
    for (int j=0; j<=ncl+1; ++j) {
        w[1][j]=(double)(rank+1)*2.;
        w[nrl][j]=(double)(rank+1)*2.5;
    }

    //Boundary conditions
    double topBC=0.;
    double bottomBC=200.;
    double leftBC=100.;
    double rightBC=100.;

    for (int i=1; i<=nrl; ++i) {
        w[i][0]=leftBC;
        w[i][ncl+1]=rightBC;
    }

    if (rank==0) {
        for (int i=0; i<=ncl+1; ++i) {
            w[0][i]=topBC;
        }
    }
    if (rank==nprocs-1) {
        for (int i=0; i<=ncl+1; ++i) {
            w[nrl+1][i]=bottomBC;
        }
    }
    //Exchange the halo rows: post the receives first, then the sends
    const int nrequests=4;
    MPI_Request requests[nrequests];

    MPI_Irecv(&w[nrl+1][0], ncl+2, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[0]);
    MPI_Irecv(&w[0][0],     ncl+2, MPI_DOUBLE, up,   tag, MPI_COMM_WORLD, &requests[1]);
    MPI_Isend(&w[1][0],     ncl+2, MPI_DOUBLE, up,   tag, MPI_COMM_WORLD, &requests[2]);
    MPI_Isend(&w[nrl][0],   ncl+2, MPI_DOUBLE, down, tag, MPI_COMM_WORLD, &requests[3]);

    MPI_Status status_arr[nrequests];
    MPI_Waitall(nrequests, requests, status_arr);
    //Check results
    MPI_Barrier(MPI_COMM_WORLD);

    int uwsize=(nrl+2)*(ncl+2);
    double *u = new double[uwsize];
    memset(u, 0, uwsize*sizeof(double));
    int position;

    if ( rank == 0 ) {
        cout<<"---After for rank 0---"<<endl;
        for (int i=0; i<nrl+2; i++) {
            for (int j=0; j<ncl+2; j++) {
                cout<<w[i][j]<<" ";
            }
            cout<<endl;
        }
        //Receive the packed array from each of the other ranks and print it
        for (int n=1; n<nprocs; ++n) {
            memset(u, 0, uwsize*sizeof(double));
            MPI_Recv(u, uwsize, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            cout<<"---After for rank "<<status.MPI_SOURCE<<"---"<<endl;
            position=0;
            for (int i=0; i<=nrl+1; i++) {
                for (int j=0; j<=ncl+1; j++) {
                    cout<<u[position++]<<" ";
                }
                cout<<endl;
            }
        }
    }
    else {
        // Pack the 2D array into the buffer
        position=0;
        for (int i=0; i<=nrl+1; i++)
            for (int j=0; j<=ncl+1; j++)
                u[position++]=w[i][j];
        MPI_Send(u, uwsize, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
}
program halo
  use mpi_f08
  implicit none

  double precision, allocatable, dimension(:,:) :: w,u
  integer :: N=12
  double precision :: topBC, bottomBC, leftBC, rightBC
  integer :: i, j
  integer :: nr, nc, nrl, ncl
  integer :: rank, nprocs, tag=0
  type(MPI_Status), dimension(:), allocatable :: mpi_status_arr
  type(MPI_Request), dimension(:), allocatable :: mpi_requests
  integer :: nrequests
  integer, parameter :: root=0
  integer :: left, right
  integer :: usize
  type(MPI_Status) :: check_stat
  !Initialize MPI, get the local number of columns
  call MPI_INIT()
  call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs)
  call MPI_COMM_RANK(MPI_COMM_WORLD,rank)

  !Usually we'd read in nr and nc; they represent the global grid size RxC.
  !For this case they are both N.
  nr=N
  nc=N

  !Generally we'd just use nr and not duplicate; this is for clarity later
  nrl=nr

  ! weak scaling
  !ncl = nc

  ! strong scaling
  if (mod(nc,nprocs).ne.0) then
    call MPI_FINALIZE()
    stop 'Not an even division of the array'
  else
    ncl = nc/nprocs
  endif

  ! Find my neighbors
  if ( rank .eq. root ) then
    left = MPI_PROC_NULL
  else
    left = rank - 1
  endif
  if ( rank .eq. nprocs-1 ) then
    right = MPI_PROC_NULL
  else
    right = rank + 1
  endif
  !Two extra rows and columns for the boundaries and halo
  allocate(w(0:nrl+1,0:ncl+1))

  ! Initialize the array
  w(:,:)=50.

  !Set up the columns to exchange
  w(:,1)=dble(rank+1)*2
  w(:,ncl)=dble(rank+1)*2.5

  ! Boundary conditions
  topBC=0.d0
  bottomBC=200.d0
  leftBC=100.d0
  rightBC=100.d0

  !The Fortran version is decomposed by columns, so every rank sets the top
  !and bottom boundaries; only the end ranks own the left and right ones.
  w(0,:)=topBC
  w(nrl+1,:)=bottomBC
  if (rank==0) then
    w(1:nrl,0)=leftBC
  endif
  if (rank==nprocs-1) then
    w(1:nrl,ncl+1)=rightBC
  endif

  write(*,*) "Topology ",rank, left, right
  ! Exchange halo values
  nrequests=4
  allocate(mpi_requests(nrequests),mpi_status_arr(nrequests))

  call MPI_Irecv(w(1:nrl,ncl+1),nrl,MPI_DOUBLE_PRECISION,right,tag, &
                 MPI_COMM_WORLD,mpi_requests(1))
  call MPI_Irecv(w(1:nrl,0),nrl,MPI_DOUBLE_PRECISION,left,tag, &
                 MPI_COMM_WORLD,mpi_requests(2))
  call MPI_Isend(w(1:nrl,1),nrl,MPI_DOUBLE_PRECISION,left,tag, &
                 MPI_COMM_WORLD,mpi_requests(3))
  call MPI_Isend(w(1:nrl,ncl),nrl,MPI_DOUBLE_PRECISION,right,tag, &
                 MPI_COMM_WORLD,mpi_requests(4))

  call MPI_Waitall(nrequests,mpi_requests,mpi_status_arr)
  !Spot-check result
  ! u only used here, strictly only rank 0 needs it
  if (rank==0) then
    allocate(u(0:nrl+1,0:ncl+1))
    usize=size(u)
    write(*,*) 'Result for rank 0'
    do j=0,nrl+1
      write(*,'(*(f10.2))') w(j,:)
    enddo
    do i=1,nprocs-1
      u=0.d0
      call MPI_Recv(u,usize,MPI_DOUBLE_PRECISION,MPI_ANY_SOURCE, &
                    MPI_ANY_TAG, MPI_COMM_WORLD,check_stat)
      write(*,*) 'Result for rank',check_stat%MPI_SOURCE
      do j=0,nrl+1
        write(*,'(*(f10.2))') u(j,:)
      enddo
    enddo
  else
    call MPI_Send(w,size(w),MPI_DOUBLE_PRECISION,0,rank,MPI_COMM_WORLD)
  endif

  call MPI_FINALIZE()

end program
import sys
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

N = 12

#Strong scaling
if N%nprocs==0:
    nrl = N//nprocs
else:
    print("Number of ranks should divide the number of rows evenly.")
    sys.exit()

#Weak scaling
#nrl = N

#Both
ncl = N
w = np.zeros((nrl+2, ncl+2), dtype=np.double)

#Arbitrary value for the interior.
#Be sure not to overwrite boundaries.
w[1:nrl+1,1:ncl+1] = 50.

#Rank-specific values for some rows
w[1,:] = (rank+1)*2.
w[nrl,:] = (rank+1)*2.5

#Set up boundaries
topBC = 0.
bottomBC = 200.
leftBC = 100.
rightBC = 100.

if rank == 0:
    w[0,:] = topBC            # top
if rank == nprocs-1:
    w[nrl+1,:] = bottomBC     # bottom
w[:,0] = leftBC               # left
w[:,ncl+1] = rightBC          # right
# Set up the up and down neighbor ranks for each process
if rank == 0:
    up = MPI.PROC_NULL
else:
    up = rank - 1

if rank == nprocs - 1:
    down = MPI.PROC_NULL
else:
    down = rank + 1

tag = 0

# Post the receives first, then the sends
rcv_down = comm.Irecv([w[nrl+1,1:ncl+1],MPI.DOUBLE], down)
rcv_up = comm.Irecv([w[0,1:ncl+1],MPI.DOUBLE], up)
send_up = comm.Isend([w[1,1:ncl+1],MPI.DOUBLE], up)
send_down = comm.Isend([w[nrl,1:ncl+1],MPI.DOUBLE], down)

requests = [rcv_down, rcv_up, send_up, send_down]
MPI.Request.Waitall(requests)
# Print result. We will skip printing the starting values this time.
comm.Barrier()

status = MPI.Status()

if rank == 0:
    u = np.zeros_like(w)
    print("Final for rank 0")
    for i in range(nrl+2):
        print(w[i,:])
    print()
    for i in range(1,nprocs):
        comm.Recv([u,MPI.DOUBLE], source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        print("Final for rank", status.Get_source())
        for j in range(nrl+2):
            print(u[j,:])
        print()
else:
    comm.Send([w,MPI.DOUBLE], dest=0, tag=rank)