MPI Two Dimensional Halo Exercise
We can now pull all of our new knowledge together to let MPI do more of the work for us in the heated-plate problem.
Exercise
In the language of your choice, write a heated-plate solution using two-dimensional data decomposition. Use MPI types and the MPI Cartesian communicator. Stay with blocking sendrecv for this exercise. Boundary conditions should be nonperiodic in both directions.
Check your results by plotting. If you wish, you can use the script contour2d.py:
contour2d.py source code
import sys
import os
import argparse
import re
import numpy as np
import matplotlib.pyplot as plt
# Parse the process-grid shape and the base name of the per-rank output
# files (each rank writes base<row><col>, e.g. out00, out01, ...).
parser = argparse.ArgumentParser()
parser.add_argument("-r", "--rows", required=True,
                    help="Rows in process topology")
parser.add_argument("-c", "--cols", required=True,
                    help="Columns in process topology")
parser.add_argument("filename", help="Base name for data files")
args = parser.parse_args()

base = args.filename
nrows = int(args.rows)
ncols = int(args.cols)

# Stitch the subdomain files back into the global grid: hstack the tiles of
# each process row, then vstack the assembled rows.
# Fix: the original overwrote rowimage on every pass and called
# np.vstack on only the LAST process row, so multi-row topologies
# produced a truncated image. Accumulate every row and stack once.
rowimages = []
for row in range(nrows):
    subdomains = []
    for col in range(ncols):
        thisrow = base + str(row) + str(col)
        print(thisrow)
        data = np.loadtxt(thisrow, unpack=False)
        subdomains.append(data)
    rowimages.append(np.hstack(subdomains))
image = np.vstack(rowimages)

fig = plt.figure()
plt.contourf(image)
plt.colorbar()
plt.show()
The usage is
python contour2d.py -r numrows -c numcols basefilename
The files should be written by row with the name fitting the pattern
base<coord[0]coord[1]>
(or coord(1)coord(2) for Fortran, where coordinates are 1-based) e.g.
out00 out01 out11 ...
Example Solutions
C++
#include <iostream>
#include <cmath>
#include <string>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <mpi.h>
#define MAX_ITER 10000000
void set_bcs(int nr, int nc, int nrows, int ncols, int* coords, double **u);
using namespace std;
int main (int argc, char *argv[]) {
double diff; // Change in value
int N, M;
int iterations = 0;
int diffInterval;
double epsilon;
// Added for MPI
int nrows, ncols;
int nrl, ncl;
int rank, nprocs;
int errcode;
int up, down, right, left;
int tag=0;
double gdiff;
//Initialize MPI
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
/* check number of parameters and read in epsilon, filename, optionally size */
if (argc < 3) {
printf ("USAGE: %s epsilon output-file <N> <M>\n", argv[0]);
MPI_Abort(MPI_COMM_WORLD,errcode);
return 1;
}
else {
stringstream ssepsilon;
ssepsilon<<argv[1];
if (ssepsilon.fail()) {
printf ("Error reading in epsilon value\n");
MPI_Abort(MPI_COMM_WORLD,errcode);
return 2;
}
ssepsilon>>epsilon;
if (argc==3) {
N=500;
M=500;
}
if (argc==4) {
stringstream ssnr;
ssnr<<argv[3];
if (ssnr.fail()) {
cout<<"Error converting row dimension \n";
MPI_Abort(MPI_COMM_WORLD,errcode);
return 2;
}
ssnr>>N;
M=N;
}
if (argc==5) {
stringstream ssnr;
ssnr<<argv[3];
if (ssnr.fail()) {
cout<<"Error converting row dimension \n";
MPI_Abort(MPI_COMM_WORLD,errcode);
return 2;
}
ssnr>>N;
stringstream ssnc;
ssnc<<argv[4];
if (ssnc.fail()) {
cout<<"Error converting row dimension \n";
MPI_Abort(MPI_COMM_WORLD,errcode);
return 2;
}
ssnc>>M;
}
}
//For simplicity, we will limit this code to perfect square process count
double rows=sqrt(nprocs);
nrows=(int)rows;
if ( nrows*nrows != nprocs ) {
cout<<"This code requires a perfect square number of processes\n";
MPI_Abort(MPI_COMM_WORLD,errcode);
return 1;
}
else {
ncols=nrows;
}
//Strong scaling
//Weak scaling would set nrl=N and ncl=M
if (N%nrows==0 && M%ncols==0) {
nrl = N/nrows;
ncl = M/ncols;
}
else {
cout<<"Number of ranks should divide the total number of rows/cols evenly.\n";
MPI_Abort(MPI_COMM_WORLD,errcode);
}
double **u=new double*[nrl+2];
double **w=new double*[nrl+2];
double **diffs=new double*[nrl+2];
double *uptr=new double[(nrl+2)*(ncl+2)];
double *wptr=new double[(nrl+2)*(ncl+2)];
double *dptr=new double[(nrl+2)*(ncl+2)];
for (int i=0;i<nrl+2;++i,uptr+=ncl+2) {
u[i] = uptr;
}
for (int i=0;i<nrl+2;++i,wptr+=ncl+2) {
w[i] = wptr;
}
for (int i=0;i<nrl+2;++i,dptr+=ncl+2) {
diffs[i] = dptr;
}
//create Cartesian topology
int ndims=2;
int dims[2]={nrows,ncols};
int periods[2]={0,0};
int reorder=0;
MPI_Comm grid_comm;
int grid_rank;
int grid_coords[2];
MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &grid_comm);
MPI_Comm_rank(grid_comm, &grid_rank);
MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords);
int direction=0;
int displ=1;
MPI_Cart_shift(grid_comm, direction, displ, &up, &down);
direction=1;
MPI_Cart_shift(grid_comm, direction, displ, &left, &right);
cout<<"topo "<<grid_rank<<" "<<left<<" "<<right<<" "<<up<<" "<<down<<endl;
//Set up the MPI type for columns
MPI_Datatype col;
MPI_Type_vector(nrl+2,1,ncl+2,MPI_DOUBLE,&col);
MPI_Type_commit(&col);
MPI_Status status;
//Initialize solution matrices
for ( int i = 0; i <= nrl+1; i++ ) {
for (int j = 0; j <= ncl+1; j++ ) {
u[i][j] = 0.;
w[i][j] = 0.;
}
}
set_bcs(nrl, ncl, nrows, ncols, grid_coords, u);
diffInterval=1;
double time0=MPI_Wtime();
// Compute steady-state solution
// The diffs array is mostly to make it simpler to extend the testing interval
if (rank==0) {
cout<<"Running until the difference is <="<<epsilon<<" with size "<<N<<"x"<<M<<"\n";
}
while ( iterations<=MAX_ITER ) {
// reset diff each time through to get max abs diff
// max_element doesn't work great for twod arrays and is often slow
diff=.8*epsilon;
MPI_Sendrecv(&u[1][1],ncl, MPI_DOUBLE,up,tag,&u[nrl+1][1],
ncl, MPI_DOUBLE,down,tag,grid_comm,&status);
MPI_Sendrecv(&u[nrl][1],ncl,MPI_DOUBLE,down,tag,&u[0][1],
ncl,MPI_DOUBLE,up,tag,grid_comm,&status);
MPI_Sendrecv(&u[0][ncl],1,col,right,tag,&u[0][0],1,col,left, tag, grid_comm, &status);
MPI_Sendrecv(&u[0][1],1,col,left,tag,&u[0][ncl+1],1,col,right, tag, grid_comm, &status);
for (int i=1; i<=nrl;i++) {
for (int j=1;j<=ncl;j++) {
w[i][j] = (u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1])/4.0;
diffs[i][j] = abs(w[i][j] - u[i][j]);
}
}
if (iterations%diffInterval==0) {
for (int i=1; i<=nrl;i++) {
for (int j=1;j<=ncl;j++) {
if (diff<diffs[i][j]) {
diff=diffs[i][j];
}
}
}
//Find max of diff in all the processors.
MPI_Allreduce(&diff,&gdiff,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
if (gdiff <= epsilon)
break;
}
//Set halo values
/*
*/
for (int j=0; j<=ncl+1; j++) {
w[0][j]=u[0][j];
w[nrl+1][j]=u[nrl+1][j];
}
for (int i=0;i<=nrl+1; i++) {
w[i][0]=u[i][0];
w[i][ncl+1]=u[i][ncl+1];
}
for (int i=0; i<=nrl+1;i++) {
for (int j=0;j<=ncl+1;j++) {
u[i][j] = w[i][j];
}
}
set_bcs(nrl, ncl, nrows, ncols, grid_coords, u);
iterations++;
} //end of computation
if (iterations>MAX_ITER) {
if (rank==0) {
cout<<"Warning: maximum iterations exceeded\n";
}
}
double totalTime=(MPI_Wtime()-time0);
if (rank==0) {
cout << "Completed "<<iterations<<" iterations; time "<<totalTime<<endl;
}
// Write solution to output file
ofstream fp;
string fname=argv[2]+to_string(grid_coords[0])+to_string(grid_coords[1]);
fp.open(fname,ios::out);
for (int i = 1; i <= nrl; i++) {
for (int j = 1; j <= ncl; j++) {
fp<<u[i][j]<<" ";
}
fp<<"\n";
}
fp.close();
//Type_free not really necessary but good practice
MPI_Type_free(&col);
MPI_Comm_free(&grid_comm);
MPI_Finalize();
return 0;
}
void set_bcs(int nr, int nc, int nrows, int ncols, int* coords, double **u) {
    /* Apply the fixed physical boundary values in place.
     * Ice bath (0) along the top edge; the other three global edges are
     * held at 100. Only ranks on an edge of the process grid write.
     * Note: when plotting, 0,0 is the bottom.
     */
    const double topBC = 0.;
    const double bottomBC = 100.;
    const double leftBC = 100.;
    const double rightBC = 100.;
    const bool onTop    = (coords[0] == 0);
    const bool onBottom = (coords[0] == nrows - 1);
    const bool onLeft   = (coords[1] == 0);
    const bool onRight  = (coords[1] == ncols - 1);
    if (onTop) {
        for (int j = 0; j <= nc + 1; ++j) u[0][j] = topBC;
    } else if (onBottom) {
        for (int j = 0; j <= nc + 1; ++j) u[nr + 1][j] = bottomBC;
    }
    if (onLeft) {
        for (int k = 0; k <= nr + 1; ++k) u[k][0] = leftBC;
    } else if (onRight) {
        for (int k = 0; k <= nr + 1; ++k) u[k][nc + 1] = rightBC;
    }
}
Fortran
program sendrows
   ! Steady-state heated plate by Jacobi iteration with a two-dimensional
   ! domain decomposition on a Cartesian communicator (mpi_f08).
   use mpi_f08
   implicit none   ! fix: added; the original had no implicit none
   integer, parameter :: maxiter=10000000
   integer :: N, M
   integer :: i,j
   ! Fix: the counter was declared as "interations" while the loop used
   ! "iterations", so the real loop variable was implicitly typed and
   ! never initialized.
   integer :: numargs, diff_interval, iterations=0
   double precision :: eps, diff = 1.0
   character(len=80) :: arg, filename
   character(len=4) :: char_row, char_col
   double precision, allocatable, dimension(:,:) :: u, w
   double precision :: time0, time1
   double precision :: rows
   integer :: nrows, ncols, nrl, ncl
   integer :: rank, nprocs, tag=0
   integer :: left, right, up, down
   integer, dimension(2) :: dims, grid_coords
   logical, dimension(2) :: periods
   logical :: reorder
   integer :: ndims, grid_rank
   integer :: direction, displ
   integer, parameter :: root=0
   type(MPI_Comm) :: grid_comm
   type(MPI_Status) :: mpi_stat
   type(MPI_Datatype) :: row
   double precision :: gdiff
   character(len=24) :: fname
   interface
      subroutine set_bcs(nr,nc,nrows,ncols,coords,u)
      implicit none
      integer, intent(in) :: nr, nc, nrows, ncols
      integer, dimension(:), intent(in) :: coords
      double precision, dimension(0:,0:), intent(inout) :: u
      end subroutine
   end interface

   !Initialize MPI, get the local number of columns
   call MPI_INIT()
   call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs)
   call MPI_COMM_RANK(MPI_COMM_WORLD,rank)

   ! check number of parameters and read in epsilon, filename, optionally size
   ! all ranks do this
   numargs=command_argument_count()
   if (numargs .lt. 2) then
      call MPI_Finalize()
      stop 'USAGE: epsilon output-file <N> <M>'
   else
      call get_command_argument(1,arg)
      read(arg,*) eps
      call get_command_argument(2,filename)
      if (numargs==2) then
         N=500
         M=500
      endif
      if (numargs==3) then
         call get_command_argument(3,arg)
         read(arg,*) N
         M=N
      endif
      if (numargs==4) then
         call get_command_argument(3,arg)
         read(arg,*) N
         call get_command_argument(4,arg)
         read(arg,*) M
      endif
   endif

   !For simplicity, we will limit this code to perfect square process count
   rows=sqrt(dble(nprocs))
   if ( rows /= dble(int(rows)) ) then
      call MPI_Finalize()
      stop "This code requires a perfect square number of processes"
   else
      nrows=int(rows)
      ncols=nrows
   endif

   !Strong scaling: each rank owns an (N/nrows) x (M/ncols) tile.
   !Weak scaling would set nrl=N and ncl=M
   if (mod(N,nrows)==0 .and. mod(M,ncols)==0) then
      nrl = N/nrows
      ncl = M/ncols
   else
      call MPI_Finalize()
      stop "Number of ranks should divide the number of rows evenly."
   endif

   !create cartesian topology for processes (nonperiodic both directions)
   ndims=2
   dims(1) = nrows      ! rows of processes
   dims(2) = ncols      ! columns of processes
   periods(1) = .false. ! not cyclic in this direction
   periods(2) = .false. ! not cyclic in this direction
   reorder = .false.
   call MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, &
                        reorder, grid_comm)
   call MPI_Comm_rank(grid_comm, grid_rank)
   call MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords)
   ! MPI_Cart_shift returns (source, dest) for a positive shift, so "up" is
   ! the neighbor with the smaller row coordinate and "down" the larger.
   direction = 0        ! shift along the 1st index (0 or 1)
   displ = 1            ! shift by 1
   call MPI_Cart_shift(grid_comm, direction, displ, up, down)
   direction = 1        ! shift along the 2nd index (0 or 1)
   call MPI_Cart_shift(grid_comm, direction, displ, left, right)
   write(*,*) 'topo', grid_rank, up, down, left, right

   ! Local tile with one halo/boundary layer on each side
   allocate(u(0:nrl+1, 0:ncl+1), w(0:nrl+1,0:ncl+1))
   u=0.d0
   w=0.d0
   ! Boundary conditions
   call set_bcs(nrl,ncl,nrows,ncols,grid_coords,u)

   ! A "row" of the column-major array: ncl+2 elements with stride nrl+2
   call MPI_Type_vector(ncl+2,1,nrl+2,MPI_DOUBLE_PRECISION,row)
   call MPI_TYPE_COMMIT(row)

   diff_interval=1      ! set to -1 to disable the convergence test

   ! Walltime from process 0 is good enough for me
   if (grid_rank .eq. 0) then
      write (*,'(a,es15.8,a,i4,a,i4)') 'Running until the difference is <= ', &
                                        eps,' for global size ',N,'x',M
      time0=MPI_WTIME()
   endif

   ! Compute steady-state solution
   do while ( iterations<=maxiter )
      ! Exchange halo values
      ! Send left and right (first/last interior columns, contiguous)
      call MPI_SENDRECV(u(1:nrl,1)   ,nrl,MPI_DOUBLE_PRECISION,left,tag,  &
                        u(1:nrl,ncl+1),nrl,MPI_DOUBLE_PRECISION,right,tag, &
                        grid_comm,mpi_stat)
      call MPI_SENDRECV(u(1:nrl,ncl) ,nrl,MPI_DOUBLE_PRECISION,right,tag, &
                        u(1:nrl,0)   ,nrl,MPI_DOUBLE_PRECISION,left,tag,  &
                        grid_comm,mpi_stat)
      ! Send up and down. Fix: send the FIRST interior row up (so the rank
      ! below us fills our bottom halo with ITS first interior row) and the
      ! LAST interior row down; the original sent the opposite rows, which
      ! filled each halo with a non-adjacent row of the neighbor's tile.
      call MPI_SENDRECV(u(1,0)    ,1, row, up,tag,   &
                        u(nrl+1,0),1, row, down,tag, &
                        grid_comm,mpi_stat)
      call MPI_SENDRECV(u(nrl,0)  ,1, row, down,tag, &
                        u(0,0)    ,1, row, up,tag,   &
                        grid_comm,mpi_stat)

      ! Jacobi sweep over the interior
      do j=1,ncl
         do i=1,nrl
            w(i,j) = 0.25*(u(i-1,j) + u(i+1,j) + u(i,j-1) + u(i,j+1))
         enddo
      enddo

      ! Convergence test. Fix: the original guarded this with
      ! "if (diff_interval==-1) continue", but Fortran's CONTINUE is a
      ! no-op statement, so the test was never actually disabled.
      if (diff_interval > 0) then
         if (mod(iterations,diff_interval)==0) then
            diff=maxval(abs(w(1:nrl,1:ncl)-u(1:nrl,1:ncl)))
            call MPI_ALLREDUCE(diff,gdiff,1,MPI_DOUBLE_PRECISION,MPI_MAX, &
                               MPI_COMM_WORLD)
            if (gdiff <= eps) then
               exit
            endif
         endif
      endif

      !Carry halo values over so u=w below doesn't clobber them
      w(0,:)=u(0,:)
      w(nrl+1,:)=u(nrl+1,:)
      w(:,0)=u(:,0)
      w(:,ncl+1)=u(:,ncl+1)
      u = w
      ! Reset physical boundaries (they get overwritten in the halo exchange)
      call set_bcs(nrl,ncl,nrows,ncols,grid_coords,u)
      iterations = iterations + 1
   enddo

   ! Fix: this warning used to sit inside the loop BEFORE the increment,
   ! where iterations<=maxiter always held, so it could never fire.
   if (iterations>maxiter) then
      if (grid_rank==0) then
         write(*,*) "Warning: maximum iterations exceeded"
      endif
   endif

   if (grid_rank==root) then
      time1 = MPI_WTIME()
      write(*,*) 'completed; iterations = ', iterations,"in ",time1-time0," sec"
   endif

   ! Write solution to output file by row. Must use correct stitching to get
   ! overall result, such as a numpy concatenate on axis=1.
   ! It would also work to output by columns, in which case stitching is more
   ! straightforward.
   !Some compilers (Intel) like to add EOLs to line up columns in output. The
   ! Fortran standard permits this with * IO, but it complicates plotting.
   ! Use some F08 features to fix this.
   write(char_row,'(i4)') grid_coords(1)
   write(char_col,'(i4)') grid_coords(2)
   fname=trim(filename)//trim(adjustl(char_row))//trim(adjustl(char_col))
   open (unit=10,file=fname)
   do i=1,nrl
      write (10,'(*(g0,1x))') u(i,1:ncl)
   enddo
   close(10)   ! fix: the unit was never closed explicitly

   call MPI_Type_free(row)   ! good practice, matches the C++ version
   call MPI_Comm_free(grid_comm)
   call MPI_Finalize()
end program
subroutine set_bcs(nr,nc,nrows,ncols,coords,u)
   ! Apply the fixed physical boundary values in place on u.
   ! Ice bath (0) on the top edge; the other three global edges are held
   ! at 100. Only ranks whose Cartesian coordinates put them on an edge of
   ! the process grid write anything.
   ! Note: when plotting, 0,0 is the bottom.
   implicit none
   integer, intent(in) :: nr, nc, nrows, ncols
   integer, dimension(:), intent(in) :: coords
   double precision, dimension(0:,0:), intent(inout) :: u
   double precision, parameter :: iceBC = 0.d0
   double precision, parameter :: hotBC = 100.d0

   ! Top/bottom edges of the global domain (first grid dimension)
   if (coords(1) == 0) then
      u(0,:) = iceBC
   else if (coords(1) == nrows-1) then
      u(nr+1,:) = hotBC
   endif
   ! Left/right edges of the global domain (second grid dimension)
   if (coords(2) == 0) then
      u(:,0) = hotBC
   else if (coords(2) == ncols-1) then
      u(:,nc+1) = hotBC
   endif
end subroutine set_bcs
Python
import sys
import time
import numpy as np
from mpi4py import MPI
def set_bcs(u,nrl,ncl,nrows,ncols,coords):
    """Apply the fixed physical boundary values in place on u.

    Ice bath (0) along the top edge of the global domain; the other three
    edges are held at 100. Only ranks whose Cartesian coordinates put them
    on an edge of the process grid change anything. Because u is a mutable
    array, the caller sees the changes (this acts as a subroutine).
    """
    ice, hot = 0., 100.
    on_top    = coords[0] == 0
    on_bottom = coords[0] == nrows - 1
    on_left   = coords[1] == 0
    on_right  = coords[1] == ncols - 1
    if on_top:
        u[0, :] = ice
    if on_bottom:
        u[nrl + 1, :] = hot
    if on_left:
        u[:, 0] = hot
    if on_right:
        u[:, ncl + 1] = hot
# check number of parameters and read in parameters
#Better to use argparse but this is closer to compiled language code and simpler
#if one hasn't used argparse.
argv = sys.argv
if len(argv) > 2:
    try:
        epsilon = float(argv[1])
        filename = argv[2]
    except ValueError:
        # Fix: the bare string here was a no-op; report the error.
        print("Illegal input")
        sys.exit(1)
    if len(argv) == 3:
        N = 500
        M = 500
    if len(argv) == 4:
        try:
            N = int(argv[3])
            M = N
        except ValueError:
            print("Cannot convert grid dimension to integer")
            sys.exit(2)
    if len(argv) == 5:
        try:
            N = int(argv[3])
            M = int(argv[4])
        except ValueError:
            print("Cannot convert grid dimension to integer")
            sys.exit(2)
else:
    print('USAGE: epsilon output-file <N> <M>')
    sys.exit(1)

# Initializing MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()
root = 0
tag = 0

#Set max iterations. 10 million should do it.
max_iterations = 10000000

#Set number of rows and columns of the process grid
#For simplicity, require a perfect square number of processes
nrows = np.sqrt(nprocs)
if nrows != int(nrows):
    print("This code requires a perfect square number of processes")
    sys.exit(1)  # fix: error exits should not return status 0
nrows = int(nrows)
ncols = nrows

#Strong scaling: each rank owns an (N/nrows) x (M/ncols) tile.
#Fix: the original tested only M % nrows (wrong pair of dimensions) and
#computed ncl from N instead of M, which is wrong for non-square grids.
if N % nrows != 0 or M % ncols != 0:
    print("Global dimensions must be evenly divisible by the process grid")
    sys.exit(1)
nrl = N // nrows
ncl = M // ncols

#Set up topology (nonperiodic in both directions)
ndims = 2
dims = [nrows, ncols]
periods = [False, False]
grid_comm = comm.Create_cart(dims, periods)
grid_rank = grid_comm.Get_rank()
grid_coords = grid_comm.coords
# Shift returns (source, dest): "up" is the smaller row coordinate.
direction = 0
displ = 1
(up, down) = grid_comm.Shift(direction, displ)
direction = 1
(left, right) = grid_comm.Shift(direction, displ)

#Local tile with one halo/boundary layer on each side.
#Fix: these arrays were allocated twice in the original.
u = np.zeros((nrl+2, ncl+2))
w = np.zeros((nrl+2, ncl+2))
set_bcs(u, nrl, ncl, nrows, ncols, grid_coords)

#Set up subarray types, each describing one column of the local array
#(nrl interior elements starting at row 1 of the named column).
column_zero = MPI.DOUBLE.Create_subarray([nrl+2, ncl+2], [nrl, 1], [1, 0])
column_zero.Commit()
column_one = MPI.DOUBLE.Create_subarray([nrl+2, ncl+2], [nrl, 1], [1, 1])
column_one.Commit()
column_end = MPI.DOUBLE.Create_subarray([nrl+2, ncl+2], [nrl, 1], [1, ncl])
column_end.Commit()
column_endbc = MPI.DOUBLE.Create_subarray([nrl+2, ncl+2], [nrl, 1], [1, ncl+1])
column_endbc.Commit()

# Compute steady-state solution
iterations = 0
diff_interval = 1
start_time = MPI.Wtime()
my_diff = np.zeros(1)
global_diff = np.zeros(1)
if rank == root:
    print(f'Running until the difference is < {epsilon:}, global size {N:}x{M:}')
while iterations <= max_iterations:
    # Send up and receive down
    grid_comm.Sendrecv([u[1, 1:ncl+1], MPI.DOUBLE], up, tag, [u[nrl+1, 1:ncl+1], MPI.DOUBLE], down, tag)
    # Send down and receive up
    grid_comm.Sendrecv([u[nrl, 1:ncl+1], MPI.DOUBLE], down, tag, [u[0, 1:ncl+1], MPI.DOUBLE], up, tag)
    # Send and receive left and right
    grid_comm.Sendrecv([u, 1, column_one], left, tag, [u, 1, column_endbc], right, tag)
    grid_comm.Sendrecv([u, 1, column_end], right, tag, [u, 1, column_zero], left, tag)
    # Jacobi sweep over the interior
    w[1:-1, 1:-1] = 0.25*(u[:-2, 1:-1] + u[2:, 1:-1] + u[1:-1, :-2] + u[1:-1, 2:])
    # Carry halo values over so u=w below doesn't clobber them
    w[0, :] = u[0, :]
    w[nrl+1, :] = u[nrl+1, :]
    w[:, 0] = u[:, 0]
    w[:, ncl+1] = u[:, ncl+1]
    if iterations % diff_interval == 0:
        my_diff[0] = np.max(np.abs(w[1:-1, 1:-1] - u[1:-1, 1:-1]))
        comm.Allreduce(my_diff, global_diff, op=MPI.MAX)
        if global_diff[0] <= epsilon:
            break
    u[:, :] = w[:, :]
    #Reapply physical boundary conditions
    set_bcs(u, nrl, ncl, nrows, ncols, grid_coords)
    iterations += 1
#This is what the weird "else" clause in Python for/while loops is used to do.
#Our stopping criterion is now on exceeding max_iterations so don't need
# to break, but we need to warn that it happened.
else:
    if grid_rank == 0:
        print("Warning: maximum iterations exceeded")

total_time = MPI.Wtime() - start_time
if grid_rank == 0:
    print(f'completed in {iterations:} iterations with time {total_time:.2f}')

# Write this rank's interior to <base><row><col> so contour2d.py can stitch
# the tiles. Fix: use a context manager; the original never closed the file.
filename = filename + str(grid_coords[0]) + str(grid_coords[1])
with open(filename, 'w') as fout:
    for i in range(1, nrl+1):
        line = " ".join(map(str, list(u[i, 1:ncl+1])))
        fout.write(line + "\n")
# All done!
#print(f"wrote output file {filename:}")