MPI Project Set 4

Project 1

Many algorithms can or must utilize more than one ghost zone in each dimension. Write a program in the language of your choice to send and receive the halo zones between two processes. You may assume the number of ghost zones is the same in each dimension. You do not have to use subarrays but they may simplify the code.

Example Solutions

C++

C++

Contents of mpi_ghostzones.cxx

#include <cstring>
#include <iostream> 
#include <iomanip> 
#include <mpi.h> 

using namespace std; 

/*#This example exchanges data among four rectangular domains with halos.  Most real codes use squares, but we want to illustrate how to use different dimensions. 
*/

int main (int argc, char *argv[]) {

    int N, M;

    // Added for MPI
    int nrl, ncl;
    int rank, grid_rank, nprocs;
    int up, down, right, left;
    int tag=0;
    int nghosts, w_halo;

    //Initialize MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    N = 8;
    M = 12;

    int rows=2;
    int cols=3;

    if (rows*cols != nprocs) {
        cout<<"Number of rows times columns does not equal nprocs\n";
        return 1;
    }

    if (N%rows==0 && M%cols==0) {
        nrl = N/rows;
        ncl = M/cols;
    }
    else {
        cout<<"Number of ranks should divide the number of rows evenly.\n";
        return 2;
    }

    nghosts=2;

    w_halo=nghosts*2;
    int nrl_total=nrl+w_halo;
    int ncl_total=ncl+w_halo;
    double **w=new double*[nrl_total];
    double *wptr=new double[(nrl_total)*(ncl_total)];

    for (int i=0;i<nrl_total;++i,wptr+=ncl_total) {
       w[i] = wptr;
    }

    double counter=1.;
    for ( int i = 0; i < nrl_total; i++ ) {
         for (int j = 0; j < ncl_total; j++ ) {
             w[i][j] = (rank+1)*counter;
             counter++;
         }
    }

    //Set up the topology assuming processes number left to right by row
    const int ndims=2;
    int dims[2]={rows,cols};
    int periods[2]={1,0};
    int reorder=0;
    MPI_Comm grid_comm;
    int grid_coords[2];

    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &grid_comm);
    MPI_Comm_rank(grid_comm, &grid_rank);
    MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords);

    int direction=0;
    int displ=1;
    MPI_Cart_shift(grid_comm, direction, displ, &up, &down);

    direction=1;
    MPI_Cart_shift(grid_comm, direction, displ, &left, &right);

    cout<<"topo "<<grid_rank<<" "<<up<<" "<<down<<" "<<left<<" "<<right<<endl;

    //Boundary conditions
    double topBC=100.;
    double bottomBC=400.;
    double rightBC=350.;
    double leftBC=350.;

    if (grid_coords[0]==0) {
       for (int i=0; i<nghosts; ++i) {
           for (int j=0;j<ncl_total;++j){
               w[i][j]=topBC;
           }
       }
    }
    if (grid_coords[0]==rows-1) {
        for (int i=nrl+nghosts; i<nrl_total; ++i) {
            for (int j=0;j<ncl_total;++j){
                w[i][j]=bottomBC;
            }
       }
    }

    if (grid_coords[1]==0) {
        for (int i=0; i<nrl_total; ++i){
            for (int j=0; j<nghosts; ++j) {
                w[i][j]=leftBC;
            }
        }
    }

    if (grid_coords[1]==cols-1) {
        for (int i=0;i<nrl_total;++i){
            for (int j=ncl+nghosts; j<ncl_total; ++j) {
                w[i][j]=rightBC;
            }
        }
    }


    int starts[ndims]; 
    int subsizes[ndims];

    int sizes[]={nrl_total,ncl_total};

    MPI_Datatype sbuf_left,rbuf_right,sbuf_right,rbuf_left;
    MPI_Datatype sbuf_up,rbuf_down,sbuf_down,rbuf_up;

    //Set up the MPI type for row buffers
    //
    subsizes[0]=nghosts; subsizes[1]=ncl_total;

    starts[0]=nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_up);
    MPI_Type_commit(&sbuf_up);

    starts[0]=nrl+nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_down);
    MPI_Type_commit(&rbuf_down);

    starts[0]=nrl; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_down);
    MPI_Type_commit(&sbuf_down);

    starts[0]=0; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_up);
    MPI_Type_commit(&rbuf_up);

    //Set up MPI types for columns
    //
    subsizes[0]=nrl; subsizes[1]=nghosts;

    starts[0]=nghosts; starts[1]=ncl;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_right);
    MPI_Type_commit(&sbuf_right);

    starts[0]=nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_left);
    MPI_Type_commit(&rbuf_left);

    starts[0]=nghosts; starts[1]=nghosts;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_left);
    MPI_Type_commit(&sbuf_left);

    starts[0]=nghosts; starts[1]=ncl+nghosts;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_right);
    MPI_Type_commit(&rbuf_right);

    //Buffers set up

    MPI_Status status;

    //We declared our array as an array of pointers, so we must align the 
    //beginning pointer for MPI at the start of the actual data

    MPI_Sendrecv(&(w[0][0]),1,sbuf_up,up,tag,&(w[0][0]),1,rbuf_down,down,tag,grid_comm,&status);
    MPI_Sendrecv(&(w[0][0]),1,sbuf_down,down,tag,&(w[0][0]),1,rbuf_up,up,tag,grid_comm,&status);

    MPI_Sendrecv(&(w[0][0]),1,sbuf_right,right,tag,&(w[0][0]),1,rbuf_left,left,tag,grid_comm,&status);
    MPI_Sendrecv(&(w[0][0]),1,sbuf_left,left,tag,&(w[0][0]),1,rbuf_right,right,tag,grid_comm,&status);


    //Verification. Usually done with plots in real codes.
    //Simplified from earlier examples, we'll just spot-check
    //
    double *u=new double[(nrl_total)*(ncl_total)];
    int uwsize=(nrl_total)*(ncl_total);
    memset(u,0.,uwsize);
    int position;

    MPI_Barrier(grid_comm);
    if ( grid_rank == 0 ) {
        MPI_Recv(u,uwsize,MPI_DOUBLE,1,1,grid_comm,&status);
        cout<<"Ranks 0 and 1  Check columns"<<endl;
        position=0;
        for (int i=0;i<nrl_total;i++) {
            for (int j=0;j<ncl_total;j++) {
                cout<<w[i][j]<<" ";
            }
            cout<<" | ";
            for (int j=0;j<ncl_total;j++) {
                cout<<u[position++]<<" ";
            }
            cout<<endl;
        }
        cout<<endl;
    }

    if ( grid_rank == 1 ) {
        position=0;
        for (int i=0; i<nrl_total; i++)
            for (int j=0;j<ncl_total; j++)
                u[position++]=w[i][j];

        MPI_Send(u,uwsize,MPI_DOUBLE,0,1,grid_comm);
        cout<<"Rank 1 sent to rank 0 "<<endl;

    }
    MPI_Barrier(grid_comm);


    //Check columns on other side
    //
    memset(u,0.,uwsize);
    if ( grid_rank == 1 ) {
        MPI_Recv(u,uwsize,MPI_DOUBLE,2,2,grid_comm,&status);
        cout<<"Ranks 1 and 2 Check columns"<<endl;
        position=0;
        for (int i=0;i<nrl_total;i++) {
            for (int j=0;j<ncl_total;j++) {
                cout<<w[i][j]<<" ";
            }
            cout<<" | ";
            for (int j=0;j<ncl_total;j++) {
                cout<<u[position++]<<" ";
            }
            cout<<endl;
        }
        cout<<endl;
    }

    if ( grid_rank == 2 ) {
        position=0;
        for (int i=0; i<nrl_total; i++)
            for (int j=0;j<ncl_total; j++)
                u[position++]=w[i][j];

        MPI_Send(u,uwsize,MPI_DOUBLE,1,2,grid_comm);

    }
    MPI_Barrier(grid_comm);

    //Check periodic row exchanges
    //
    memset(u,0.,uwsize);
    if ( grid_rank == 0 ) {
        MPI_Recv(u,uwsize,MPI_DOUBLE,3,3,grid_comm,&status);
        cout<<"Ranks 0 and 3  Check rows including periodic exchange"<<endl;
        for (int i=0;i<nrl_total;i++) {
            for (int j=0;j<ncl_total;j++) {
                cout<<w[i][j]<<" ";
            }
            cout<<endl;
        }

        cout<<"-=---------------------------------------------------"<<endl;

        position=0;
        for (int i=0;i<nrl_total;i++) {
            for (int j=0;j<ncl_total;j++) {
                cout<<u[position++]<<" ";
            }
            cout<<endl;
        }
        cout<<endl;
    }

    if ( grid_rank == 3 ) {
        position=0;
        for (int i=0; i<nrl_total; i++)
            for (int j=0;j<ncl_total; j++)
                u[position++]=w[i][j];

        MPI_Send(u,uwsize,MPI_DOUBLE,0,3,grid_comm);

    }
    MPI_Barrier(grid_comm);
    //end checking section

    //Type_free not really necessary but good practice
    MPI_Type_free(&sbuf_up);
    MPI_Type_free(&sbuf_down);
    MPI_Type_free(&rbuf_up);
    MPI_Type_free(&rbuf_down);
    MPI_Type_free(&sbuf_left);
    MPI_Type_free(&sbuf_right);
    MPI_Type_free(&rbuf_left);
    MPI_Type_free(&rbuf_right);

    MPI_Comm_free(&grid_comm);

    MPI_Finalize();

    return 0;

}

Download mpi_ghostzones.cxx file

Fortran

Fortran

Contents of mpi_ghostplate.f90

program ghosts
   !Jacobi solver for the steady-state heated plate, distributed over a
   !square process grid with a variable-depth halo (nghosts).
   !Command line: epsilon output-file [N [M]]
   use mpi_f08
   implicit none

   integer, parameter :: maxiters=10000000
   integer            :: iteration=0
   integer            :: N, M
   integer            :: i,j
   integer            :: numargs, diff_interval
   integer            :: lb, ubr, ubc
   double precision, allocatable, dimension(:,:)  :: u,w

   double precision   :: eps, diff = 1.0
   character(len=80)  :: arg, filename
   character(len=4)   :: char_row, char_col

   double precision   :: time0, time1
   double precision   :: rows
   integer            :: nrows, ncols, nrl, ncl
   integer            :: world_rank, nprocs, tag=0
   integer, parameter :: root=0
   integer            :: left, right, up, down
   integer            :: nghosts, w_halo
   integer            :: nrl_gz,ncl_gz,nrl_total,ncl_total
   integer            :: nsubdims
   integer, dimension(2) :: dims, grid_coords
   logical, dimension(2) :: periods
   integer, dimension(2) :: sizes, subsizes, starts
   logical            :: reorder
   integer            :: ndims, grid_rank
   integer            :: direction, displ
   integer            :: nrequests
   type(MPI_Comm)     :: grid_comm
   type(MPI_Datatype) :: sbuf_up, sbuf_down, rbuf_up, rbuf_down
   type(MPI_Datatype) :: sbuf_left, sbuf_right, rbuf_left, rbuf_right
   type(MPI_Request), dimension(:), allocatable :: mpi_requests

   double precision   :: gdiff
   character(len=24)  :: fname


   interface
     subroutine set_bcs(lb, nghosts,nr,nc,nrows,ncols,coords,u)
        implicit none
        integer,                intent(in)  :: lb, nghosts, nr, nc, nrows, ncols
        integer, dimension(:),  intent(in)  :: coords
        double precision, dimension(0:,0:), intent(inout) :: u
     end subroutine
   end interface

   !Initialize MPI, get the local number of columns
   call MPI_INIT()
   call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs)
   call MPI_COMM_RANK(MPI_COMM_WORLD,world_rank)

   ! check number of parameters and read in epsilon, filename, optionally size
   ! all ranks do this
   numargs=command_argument_count()
   if (numargs .lt. 2) then
      call MPI_Finalize()
      stop 'USAGE: epsilon output-file <N> <M>'
   else
      call get_command_argument(1,arg)
      read(arg,*) eps
      call get_command_argument(2,filename)

      if (numargs==2) then
         N=500
         M=500
      endif
      if (numargs==3) then
         call get_command_argument(3,arg)
         read(arg,*) N
         M=N
      endif
      if (numargs==4) then
         call get_command_argument(3,arg)
         read(arg,*) N
         call get_command_argument(4,arg)
         read(arg,*) M
      endif
   endif

  !For simplicity, we will limit this code to perfect square process count
   rows=sqrt(dble(nprocs))
   if ( rows /= dble(int(rows)) ) then
       call MPI_Finalize()
       stop "This code requires a perfect square number of processes"
   else
       nrows=int(rows)
       ncols=nrows
   endif

   !Strong scaling
   !Weak scaling would set nrl=N and ncl=M
   if (mod(N,nrows)==0 .and. mod(M,ncols)==0) then
       nrl = N/nrows
       ncl = M/ncols
   else
       call MPI_Finalize()
       stop "Number of ranks should divide the number of rows evenly."
   endif

   !create cartesian topology for processes
   ndims=2
   dims(1) = nrows      ! number of rows
   dims(2) = ncols      ! number of columns
   periods(1) = .false. ! not cyclic in this direction
   periods(2) = .false. ! not cyclic in this direction
   reorder    = .false. ! do not allow MPI to reorder
   call MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods,                 &
                                                     reorder, grid_comm)
   call MPI_Comm_rank(grid_comm, grid_rank)
   call MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords)

   direction =  0   ! shift along the 1st index (0 or 1)
   displ =  1       ! shift by  1
   call MPI_Cart_shift(grid_comm, direction, displ, up, down)

   direction =  1   ! shift along the 2nd index (0 or 1)
   call MPI_Cart_shift(grid_comm, direction, displ, left, right)

   ! Set up values
   ! Assume number of ghosts zones same in both dimensions, lower bounds same
   nghosts=1        ! number of ghost zones
   w_halo=2*nghosts ! total halo width in each dimension (including corners)

   nrl_gz=nrl+nghosts   ! first index past the interior, each dimension
   ncl_gz=ncl+nghosts
   nrl_total=nrl+w_halo ! local extents including the halo
   ncl_total=ncl+w_halo

   !Assume lower bound same for both dimensions
   lb=0
   ubr=nrl_total-1
   ubc=ncl_total-1
   allocate(u(lb:ubr,lb:ubc), w(lb:ubr,lb:ubc))

   u=0.d0
   w=0.d0

   !Physical boundary conditions
   call set_bcs(lb,nghosts,nrl,ncl,nrows,ncols,grid_coords,u)

   ! Set up sendrecv/buffers
   nsubdims  = 2
   sizes  = [nrl_total,ncl_total]

   !Rows: nghosts full-width rows per strip
   subsizes  = [nghosts,ncl_total]

   ! Start indices must begin at 0 like C regardless of declared lower bound

   starts = [nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                             MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION, sbuf_up)
   call MPI_TYPE_COMMIT(sbuf_up)

   starts = [nrl+nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_down)
   call MPI_TYPE_COMMIT(rbuf_down)

   starts = [nrl,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_down)
   call MPI_TYPE_COMMIT(sbuf_down)

   starts = [0,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION, rbuf_up)
   call MPI_TYPE_COMMIT(rbuf_up)

   !Columns: nghosts columns, interior rows only (corners ride with rows)
   subsizes  = [nrl,nghosts]

   starts = [nghosts,ncl]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_right)
   call MPI_TYPE_COMMIT(sbuf_right)

   starts = [nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_left)
   call MPI_TYPE_COMMIT(rbuf_left)

   starts = [nghosts,nghosts]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_left)
   call MPI_TYPE_COMMIT(sbuf_left)

   starts = [nghosts,ncl+nghosts]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_right)
   call MPI_TYPE_COMMIT(rbuf_right)

   nrequests=8
   allocate(mpi_requests(nrequests))

   diff_interval=1   ! test convergence every iteration; -1 disables the test

  ! Walltime from process 0 is good enough for me
   if (grid_rank .eq. 0) then
      write (*,'(a,es15.8,a,i4,a,i4)') 'Running until the difference is <= ', &
                                              eps,' for global size ',N,'x',M
     time0=MPI_WTIME()
   endif

   ! Compute steady-state solution
   do while ( iteration<=maxiters )

     ! Exchange halo values (nonblocking; completed by the Waitall below)

       call MPI_Irecv(u, 1, rbuf_right, right, tag, grid_comm,mpi_requests(1))
       call MPI_Irecv(u, 1, rbuf_left,  left,  tag, grid_comm,mpi_requests(2))
       call MPI_Irecv(u, 1, rbuf_up,    up,    tag, grid_comm,mpi_requests(3))
       call MPI_Irecv(u, 1, rbuf_down,  down,  tag, grid_comm,mpi_requests(4))

       call MPI_Isend(u, 1, sbuf_left,  left,  tag, grid_comm,mpi_requests(5))
       call MPI_Isend(u, 1, sbuf_right, right, tag, grid_comm,mpi_requests(6))
       call MPI_Isend(u, 1, sbuf_down,  down,  tag, grid_comm,mpi_requests(7))
       call MPI_Isend(u, 1, sbuf_up,    up,    tag, grid_comm,mpi_requests(8))

   ! complete communications
       call MPI_Waitall(nrequests,mpi_requests,MPI_STATUSES_IGNORE)

       !Jacobi sweep over the interior. i indexes rows (nrl of them), j
       !indexes columns (ncl); the original had the two extents swapped,
       !which was only harmless for square local domains.
       do j=lb+nghosts,lb+ncl_gz-1
           do i=lb+nghosts,lb+nrl_gz-1
               w(i,j) = 0.25*(u(i-1,j) + u(i+1,j) + u(i,j-1) + u(i,j+1))
           enddo
       enddo

       !Convergence test over the interior only. The original sliced to
       !ubr-nghosts-1, skipping the last interior row/column, and used
       !"continue" (a no-op in Fortran) in an attempt to disable the test.
       if (diff_interval/=-1 .and. mod(iteration,diff_interval)==0) then
           diff=maxval(abs(w(nghosts:ubr-nghosts,nghosts:ubc-nghosts)          &
               -u(nghosts:ubr-nghosts,nghosts:ubc-nghosts)))
           call MPI_ALLREDUCE(diff,gdiff,1,MPI_DOUBLE_PRECISION,MPI_MAX,       &
                                                         grid_comm)

           if (gdiff <= eps) then
               exit
           endif
       endif

       !Update the interior of u. Don't overwrite boundaries. Written in
       !terms of nghosts so it stays correct for any ghost depth.
       u(lb+nghosts:lb+nrl_gz-1,lb+nghosts:lb+ncl_gz-1) =                      &
                             w(lb+nghosts:lb+nrl_gz-1,lb+nghosts:lb+ncl_gz-1)

     ! Reset physical boundaries (they get overwritten in the halo exchange)
       call set_bcs(lb,nghosts,nrl,ncl,nrows,ncols,grid_coords,u)

       iteration = iteration + 1

   enddo

   !The in-loop warning in the original was unreachable (the loop condition
   !excludes iteration>maxiters), so warn after the loop instead.
   if (iteration>maxiters .and. grid_rank==root) then
       write(*,*) "Warning: maximum iterations exceeded"
   endif

   if (grid_rank==root) then
     time1 = MPI_WTIME()
     write(*,*) 'completed; iterations = ', iteration,"in ",time1-time0," sec"
   endif

  ! Write solution to output file by row. Must use correct stitching to get
  ! overall result, such as a numpy concatenate on axis=1.
  ! It would also work to output by columns, in which case stitching is more
  ! straightforward.

  !Some compilers (Intel) like to add EOLs to line up columns in output. The
  ! Fortran standard permits this with * IO, but it complicates plotting.
  ! Use some F08 features to fix this.

   write(char_row,'(i4)')  grid_coords(1)
   write(char_col,'(i4)')  grid_coords(2)
   fname=trim(filename)//trim(adjustl(char_row))//trim(adjustl(char_col))

   !Write only the interior (no ghosts). The original loop ran one row too
   !far (nrl+1 rows) and hard-coded the columns for nghosts==1.
   open (unit=10,file=fname)
   do i=lb+nghosts,lb+nrl_gz-1
      write (10,'(*(g0,1x))') u(i,lb+nghosts:lb+ncl_gz-1)
   enddo
   close(10)

   call MPI_Comm_free(grid_comm)

   call MPI_Finalize()

end program


subroutine set_bcs(lb,nghosts,nr,nc,nrows,ncols,coords,u)
  !Apply the physical (Dirichlet) boundary values into the ghost layers of
  !u for whichever global edges this rank's grid coordinates touch.
  implicit none
  integer, intent(in)                      :: lb ,nghosts, nr, nc, nrows, ncols
  integer, dimension(:), intent(in)        :: coords
  double precision, dimension(lb:,lb:), intent(inout) :: u
  double precision                         :: topBC, bottomBC, leftBC, rightBC

  ! Set boundary values
  ! This has the ice bath on the bottom edge.
  ! Note: when plotting, 0,0 is the bottom.

  topBC=0.d0
  bottomBC=100.d0
  rightBC=100.d0
  leftBC=100.d0

  !Set boundary conditions. These are independent ifs, not else-ifs: with a
  !single process row (or column) the same rank owns both opposite edges
  !and must set both (the original's else-if skipped the second edge).
   if (coords(1)==0) then
      u(lb:lb+nghosts-1,:)=topBC
   endif
   if (coords(1)==nrows-1) then
      u(nr+nghosts+lb:,:)=bottomBC
   endif
   if (coords(2)==0) then
      u(:,lb:lb+nghosts-1)=leftBC
   endif
   if (coords(2)==ncols-1) then
      u(:,nc+nghosts+lb:)=rightBC
   endif

end subroutine set_bcs

Download mpi_ghostplate.f90 file

Python

Python

Contents of mpi_ghostzones.py

import sys
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

#This example exchanges data among four rectangular domains with halos.
#Most real codes use squares, but we want to illustrate how to use different
#dimensions.

#Divide up the processes.  Either we require a perfect square, or we
#must specify how to distribute by row/column.  In a realistic program,
#the process distribution (either the total, for a perfect square, or
#the rows/columns) would be read in and we would need to check that the number
#of processes requested is consistent with the decomposition.

N = 8
M = 12

nrows=2
ncols=3

if nrows*ncols != nprocs:
    print("Number of rows times columns does not equal nprocs")
    sys.exit()

if N%nrows==0 and M%ncols==0:
    nrl = N//nrows
    ncl = M//ncols
else:
    print("Number of ranks should divide the number of rows evenly.")
    sys.exit()

#Set up the topology (periodic in the row direction only)
ndims=2
dims=[nrows, ncols]
periods=[True, False]
grid_comm=comm.Create_cart(dims, periods)

grid_rank=grid_comm.Get_rank()
grid_coords=grid_comm.coords

direction=0
displ=1
(up,down)=grid_comm.Shift(direction, displ)
direction=1
(left,right)=grid_comm.Shift(direction,displ)

#print("Topo {:d} {:} {:d} {:d} {:d}".format(grid_rank,left,right,up,down))

#Assume same number of ghost zones for rows and columns
nghosts=2
w_halo=2*nghosts        # total halo width in each dimension
nrl_total=nrl+w_halo    # local extents including the halo
ncl_total=ncl+w_halo

w = np.zeros((nrl_total, ncl_total), dtype=np.double)

#Arbitrary rank-dependent values so exchanged halos are recognizable.
w[:,:]=np.reshape(np.arange(1,(nrl_total)*(ncl_total)+1),(nrl_total,ncl_total))
w=w*(grid_rank+1)

#Boundary conditions
#Make it easier to distinguish one side from another for checking
topBC=100.
bottomBC=400.
edgeBC=350.
if grid_coords[0]==0:
    w[0:nghosts,:]=topBC
if grid_coords[0]==nrows-1:
    w[nrl+nghosts:,:]=bottomBC
if grid_coords[1]==0:
    w[:,0:nghosts]=edgeBC
if grid_coords[1]==ncols-1:
    w[:,ncl+nghosts:]=edgeBC

# set up MPI types for buffers

sizes = [nrl_total,ncl_total]

# Rows: nghosts full-width rows per strip (corners included)
subsizes  = [nghosts,ncl_total]

starts = [nghosts,0]            # first interior rows
sbuf_up=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_up.Commit()

starts = [nrl+nghosts,0]        # bottom ghost rows
rbuf_down=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_down.Commit()

starts = [nrl,0]                # last interior rows
sbuf_down=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_down.Commit()

starts = [0,0]                  # top ghost rows
rbuf_up=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_up.Commit()

# Columns: nghosts columns, interior rows only (corners ride with rows)
subsizes  = [nrl,nghosts]

starts = [nghosts,ncl]          # last interior columns
sbuf_right=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_right.Commit()

starts = [nghosts,0]            # left ghost columns
rbuf_left=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_left.Commit()

starts = [nghosts,nghosts]      # first interior columns
sbuf_left=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_left.Commit()

starts = [nghosts,ncl+nghosts]  # right ghost columns
rbuf_right=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_right.Commit()

tag=0

# sending up and receiving down
grid_comm.Sendrecv([w,1,sbuf_up], up, tag, [w,1,rbuf_down], down, tag)
# sending down and receiving up
grid_comm.Sendrecv([w,1,sbuf_down], down, tag, [w,1,rbuf_up], up, tag)

# sending left and right
grid_comm.Sendrecv([w,1,sbuf_left], left, tag, [w,1,rbuf_right], right, tag)
grid_comm.Sendrecv([w,1,sbuf_right], right, tag, [w,1,rbuf_left], left, tag)

#Results
status=MPI.Status()
#Check column exchange between ranks 0 and 1. The receive buffer u has the
#same (nrl_total, ncl_total) shape as w, so no explicit count is needed.
#(The original computed uwsize=(nrl+2)*(ncl+2), which hard-coded nghosts==1
#and was never used; it has been removed.)
u=np.zeros_like(w)

grid_comm.Barrier()
if grid_rank==0:
    grid_comm.Recv([u,MPI.DOUBLE],source=1,tag=1,status=status)
    print("Ranks 0 and 1 check columns")

    for i in range(nrl_total):
        print(w[i,:],end='')
        print("  |  ",end='')
        print(u[i,:])

    print()

if grid_rank==1:
    grid_comm.Send([w,MPI.DOUBLE],dest=0,tag=1)

grid_comm.Barrier()
u[:,:]=0.0
if grid_rank==1:
    grid_comm.Recv([u,MPI.DOUBLE],source=2,tag=2,status=status)
    print("Ranks 1 and 2 check columns")

    for i in range(nrl_total):
        print(w[i,:],end='')
        print("  |  ",end='')
        print(u[i,:])

    print()

if grid_rank==2:
    grid_comm.Send([w,MPI.DOUBLE],dest=1,tag=2)

#Check rows including periodic
grid_comm.Barrier()
u[:,:]=0.0
if grid_rank==0:
    grid_comm.Recv([u,MPI.DOUBLE],source=3,tag=3,status=status)
    print("Ranks 0 and 3 check rows including periodic exchange")

    for i in range(nrl_total):
        print(w[i,:])

    print('-----------------------------------------')

    for i in range(nrl_total):
        print(u[i,:])

if grid_rank==3:
    grid_comm.Send([w,MPI.DOUBLE],dest=0,tag=3)


Download mpi_ghostzones.py file

Project 2

  1. Modify or write a two-dimensional Jacobi solver with a variable number of ghost zones in each dimension. As in (1) you may assume the same number of ghost zones in each dimension. Use subarrays for the halo communications. Make your sends and receives nonblocking.

Example Solutions

C++

C++

Contents of mpi_ghostplate.cxx

#include <iostream>
#include <cmath>
#include <string>
#include <sstream>
#include <fstream> 
#include <iomanip>
#include <mpi.h>
   

#define MAX_ITER 10000000

void set_bcs(int nghosts, int nr, int nc, int nrows, int ncols, int* coords, double **u);

using namespace std; 

int main (int argc, char *argv[]) {


    double diff;            // Change in value
    int N, M;
    int iteration = 0;
    int diffInterval;
    double epsilon;


    // Added for MPI
    int nrows, ncols;
    int nrl, ncl;
    int rank, grid_rank, nprocs;
    int up, down, right, left;
    int tag=0;
    int nghosts, w_halo;
    int errcode;
    double gdiff;

    //Initialize MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

   //check number of parameters and read in epsilon, filename, optionally size
    if (argc < 3) {
        printf ("USAGE:  %s epsilon output-file <N> <M>\n", argv[0]);
        MPI_Abort(MPI_COMM_WORLD,errcode);
        return 1;
    }
    else {
        stringstream ssepsilon;
        ssepsilon<<argv[1];
        if (ssepsilon.fail()) {
           printf ("Error reading in epsilon value\n");
           MPI_Abort(MPI_COMM_WORLD,errcode);
           return 2;
        }
        ssepsilon>>epsilon;

        if (argc==3) {
           N=500;
           M=500;
        }
        if (argc==4) {
            stringstream ssnr;
            ssnr<<argv[3];
            if (ssnr.fail()) {
                cout<<"Error converting row dimension \n";
                MPI_Abort(MPI_COMM_WORLD,errcode);
                return 2;
            }
            ssnr>>N;
            M=N;
        }
        if (argc==5) {
        stringstream ssnr;
        ssnr<<argv[3];
        if (ssnr.fail()) {
             cout<<"Error converting row dimension \n";
             MPI_Abort(MPI_COMM_WORLD,errcode);
             return 2;
        }
        ssnr>>N;
        stringstream ssnc;
        ssnc<<argv[4];
        if (ssnc.fail()) {
             cout<<"Error converting row dimension \n";
             MPI_Abort(MPI_COMM_WORLD,errcode);
             return 2;
         }
         ssnc>>M;
      }
   }

   //For simplicity, we will limit this code to perfect square process count
    double rows=sqrt(nprocs);
    nrows=(int)rows;
    if ( nrows*nrows != nprocs ) {
        cout<<"This code requires a perfect square number of processes\n";
        MPI_Abort(MPI_COMM_WORLD,errcode);
        return 1;
    }
    else {
        ncols=nrows;
    }

    //Strong scaling
    //Weak scaling would set nrl=N and ncl=M
    if (N%nrows==0 && M%ncols==0) {
        nrl = N/nrows;
        ncl = M/ncols;
    }
    else {
        cout<<"Number of ranks should divide the total number of rows/cols evenly.\n";
        MPI_Abort(MPI_COMM_WORLD,errcode);
    }


    //Number of ghost zones
    nghosts=1;

    w_halo=nghosts*2;
    int nrl_total=nrl+w_halo;
    int ncl_total=ncl+w_halo;

    double **u=new double*[nrl_total];
    double *uptr=new double[(nrl_total)*(ncl_total)];

    double **w=new double*[nrl_total];
    double *wptr=new double[(nrl_total)*(ncl_total)];

    for (int i=0;i<nrl_total;++i,wptr+=ncl_total) {
       w[i] = wptr;
    }

    for (int i=0;i<nrl_total;++i,uptr+=ncl_total) {
       u[i] = uptr;
    }

    //Declare one-d diff array for faster access.
    int dsize = nrl*ncl;
    double *diffs = new double[dsize];

    for ( int i = 0; i < nrl_total; i++ ) {
         for (int j = 0; j < ncl_total; j++ ) {
             u[i][j] = 0.;
             w[i][j] = 0.;
         }
    }

    //Set up the topology assuming processes number left to right by row
    const int ndims=2;
    int dims[2]={nrows,ncols};
    int periods[2]={0,0};
    int reorder=0;
    MPI_Comm grid_comm;
    int grid_coords[2];

    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &grid_comm);
    MPI_Comm_rank(grid_comm, &grid_rank);
    MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords);

    int direction=0;
    int displ=1;
    MPI_Cart_shift(grid_comm, direction, displ, &up, &down);

    direction=1;
    MPI_Cart_shift(grid_comm, direction, displ, &left, &right);

    set_bcs(nghosts, nrl, ncl, nrows, ncols, grid_coords, u);

    int starts[ndims]; 
    int subsizes[ndims];

    int sizes[]={nrl_total,ncl_total};

    MPI_Datatype sbuf_left,rbuf_right,sbuf_right,rbuf_left;
    MPI_Datatype sbuf_up,rbuf_down,sbuf_down,rbuf_up;

    //Set up the MPI type for row buffers
    //
    subsizes[0]=nghosts; subsizes[1]=ncl_total;

    starts[0]=nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_up);
    MPI_Type_commit(&sbuf_up);

    starts[0]=nrl+nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_down);
    MPI_Type_commit(&rbuf_down);

    starts[0]=nrl; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_down);
    MPI_Type_commit(&sbuf_down);

    starts[0]=0; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_up);
    MPI_Type_commit(&rbuf_up);

    //Set up MPI types for columns
    //
    subsizes[0]=nrl; subsizes[1]=nghosts;

    starts[0]=nghosts; starts[1]=ncl;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_right);
    MPI_Type_commit(&sbuf_right);

    starts[0]=nghosts; starts[1]=0;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_left);
    MPI_Type_commit(&rbuf_left);

    starts[0]=nghosts; starts[1]=nghosts;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&sbuf_left);
    MPI_Type_commit(&sbuf_left);

    starts[0]=nghosts; starts[1]=ncl+nghosts;
    MPI_Type_create_subarray(ndims,sizes,subsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&rbuf_right);
    MPI_Type_commit(&rbuf_right);

    //Buffers set up

    MPI_Status status;

    int nrequests=8;
    MPI_Request requests[nrequests];

    diffInterval=1;

    if (rank==0) {
        cout<<"Running until the difference is <="<<epsilon<<" with size "<<N<<"x"<<M<<"\n";
    }

    // Compute steady-state solution
    double time0=MPI_Wtime();

    while ( iteration <= MAX_ITER ) {

        MPI_Irecv(&u[0][0], 1, rbuf_up, up, tag, grid_comm, &requests[0]);
        MPI_Irecv(&u[0][0], 1, rbuf_down, down, tag, grid_comm, &requests[1]);
        MPI_Irecv(&u[0][0], 1, rbuf_left, left, tag, grid_comm, &requests[2]);
        MPI_Irecv(&u[0][0], 1, rbuf_right, right, tag, grid_comm, &requests[3]);

        MPI_Isend(&u[0][0], 1, sbuf_down, down, tag, grid_comm, &requests[5]);
        MPI_Isend(&u[0][0], 1, sbuf_up, up, tag, grid_comm, &requests[4]);
        MPI_Isend(&u[0][0], 1, sbuf_right, right, tag, grid_comm, &requests[6]);
        MPI_Isend(&u[0][0], 1, sbuf_left, left, tag, grid_comm, &requests[7]);

        MPI_Waitall(nrequests,requests,MPI_STATUS_IGNORE);

        for (int i=nghosts; i<nrl+nghosts;i++) {
           for (int j=nghosts;j<ncl+nghosts;j++) {
               w[i][j] = (u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1])/4.0;
               diffs[(i-nghosts)*ncl+j-nghosts] = abs(w[i][j] - u[i][j]);
           }
        }

        if (iteration%diffInterval==0) {
             diff=diffs[0];
             for (int i=1;i<dsize;i++) {
                 if (diff<diffs[i]) {
                     diff=diffs[i];
                 }
             }
        }

        //Find max of diff in all the processors.
        MPI_Allreduce(&diff,&gdiff,1,MPI_DOUBLE,MPI_MAX,grid_comm);
        if (gdiff <= epsilon)
             break;

        //Update u
        for (int i=0; i<nrl_total; ++i){
            for (int j=0; j<ncl_total; ++j) {
                 u[i][j]=w[i][j];
            }
        }

        //Reset halo values
        for (int i=0;i<nghosts;i++) {
            for (int j=0; j<ncl_total; j++) {
                w[i][j]=u[i][j];
                w[nrl+nghosts+i][j]=u[nrl+nghosts+i][j];
            }
        }
        for (int i=0;i<nrl_total;i++) {
            for (int j=0;j<nghosts;j++) {
                w[i][j]=u[i][j];
                w[i][ncl+nghosts+i]=u[i][ncl+nghosts+i];
            }
        }

        set_bcs(nghosts, nrl, ncl, nrows, ncols, grid_coords, u);

        iteration++;
    } //end of computation

    if (iteration>MAX_ITER) {
         if (rank==0) {
            cout<<"Warning: maximum iteration exceeded\n";
         }
    }


    double totalTime=(MPI_Wtime()-time0);
    if (rank==0) {
        cout << "Completed "<<iteration<<" iterations; time "<<totalTime<<endl;
    }

    // Write solution to output file
    ofstream fp;
    string fname=argv[2]+to_string(grid_coords[0])+to_string(grid_coords[1]);
    fp.open(fname,ios::out);
    for (int i = 1; i <= nrl; i++) {
        for (int j = 1; j <= ncl; j++) {
            fp<<u[i][j]<<" ";
        }
        fp<<"\n";
     }
     fp.close();

    //Type_free not really necessary but good practice
    MPI_Type_free(&sbuf_up);
    MPI_Type_free(&sbuf_down);
    MPI_Type_free(&rbuf_up);
    MPI_Type_free(&rbuf_down);
    MPI_Type_free(&sbuf_left);
    MPI_Type_free(&sbuf_right);
    MPI_Type_free(&rbuf_left);
    MPI_Type_free(&rbuf_right);

    MPI_Comm_free(&grid_comm);
    MPI_Finalize();

    return 0;

}


void set_bcs(int nghosts,int nr, int nc, int nrows, int ncols, int* coords, double **u) {

  /* Apply fixed physical boundary values on the local array, including
   * its ghost layers; only ranks on an edge of the process grid write.
   * This has an ice bath on the top edge.
   * Note: when plotting, 0,0 is the bottom.
   */

  const int nr_total = nr + 2*nghosts;   // local rows including halo
  const int nc_total = nc + 2*nghosts;   // local cols including halo

  const double topBC    = 0.;
  const double bottomBC = 100.;
  const double leftBC   = 100.;
  const double rightBC  = 100.;

  // Fill the rectangular patch [r0,r1) x [c0,c1) of u with a constant.
  auto fill = [&u](int r0, int r1, int c0, int c1, double val) {
      for (int r = r0; r < r1; ++r) {
          for (int c = c0; c < c1; ++c) {
              u[r][c] = val;
          }
      }
  };

  // First grid dimension: top edge of the process grid, else bottom.
  if (coords[0] == 0) {
      fill(0, nghosts, 0, nc_total, topBC);
  } else if (coords[0] == nrows-1) {
      fill(nr+nghosts, nr_total, 0, nc_total, bottomBC);
  }

  // Second grid dimension: left edge, else right.  Applied after the
  // top/bottom fills so the side values win at the corners.
  if (coords[1] == 0) {
      fill(0, nr_total, 0, nghosts, leftBC);
  } else if (coords[1] == ncols-1) {
      fill(0, nr_total, nc+nghosts, nc_total, rightBC);
  }

}

Download mpi_ghostplate.cxx file

Fortran

Fortran

Contents of mpi_ghostplate.f90

program ghosts
   ! 2-D steady-state heat (Laplace/Jacobi) solver on a 2-D process grid
   ! with nghosts ghost layers per side, exchanged via MPI subarray types.
   use mpi_f08
   implicit none

   integer, parameter :: maxiters=10000000
   integer            :: iteration=0
   integer            :: N, M
   integer            :: i,j
   integer            :: numargs, diff_interval
   integer            :: lb, ubr, ubc
   double precision, allocatable, dimension(:,:)  :: u,w

   double precision   :: eps, diff = 1.0
   character(len=80)  :: arg, filename
   character(len=4)   :: char_row, char_col

   double precision   :: time0, time1
   double precision   :: rows
   integer            :: nrows, ncols, nrl, ncl
   integer            :: world_rank, nprocs, tag=0
   integer, parameter :: root=0
   integer            :: left, right, up, down
   integer            :: nghosts, w_halo
   integer            :: nrl_gz,ncl_gz,nrl_total,ncl_total
   integer            :: nsubdims
   integer, dimension(2) :: dims, grid_coords
   logical, dimension(2) :: periods
   integer, dimension(2) :: sizes, subsizes, starts
   logical            :: reorder
   integer            :: ndims, grid_rank
   integer            :: direction, displ
   integer            :: nrequests
   type(MPI_Comm)     :: grid_comm
   type(MPI_Datatype) :: sbuf_up, sbuf_down, rbuf_up, rbuf_down
   type(MPI_Datatype) :: sbuf_left, sbuf_right, rbuf_left, rbuf_right
   type(MPI_Request), dimension(:), allocatable :: mpi_requests

   double precision   :: gdiff
   character(len=24)  :: fname


   interface
     subroutine set_bcs(lb, nghosts,nr,nc,nrows,ncols,coords,u)
        implicit none
        integer,                intent(in)  :: lb, nghosts, nr, nc, nrows, ncols
        integer, dimension(:),  intent(in)  :: coords
        double precision, dimension(0:,0:), intent(inout) :: u
     end subroutine
   end interface

   !Initialize MPI, get the local number of columns
   call MPI_INIT()
   call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs)
   call MPI_COMM_RANK(MPI_COMM_WORLD,world_rank)

   ! check number of parameters and read in epsilon, filename, optionally size
   ! all ranks do this
   numargs=command_argument_count()
   if (numargs .lt. 2) then
      call MPI_Finalize()
      stop 'USAGE: epsilon output-file <N> <M>'
   else
      call get_command_argument(1,arg)
      read(arg,*) eps
      call get_command_argument(2,filename)

      if (numargs==2) then
         N=500
         M=500
      endif
      if (numargs==3) then
         call get_command_argument(3,arg)
         read(arg,*) N
         M=N
      endif
      if (numargs==4) then
         call get_command_argument(3,arg)
         read(arg,*) N
         call get_command_argument(4,arg)
         read(arg,*) M
      endif
   endif

  !For simplicity, we will limit this code to perfect square process count
   rows=sqrt(dble(nprocs))
   if ( rows /= dble(int(rows)) ) then
       call MPI_Finalize()
       stop "This code requires a perfect square number of processes"
   else
       nrows=int(rows)
       ncols=nrows
   endif

   !Strong scaling
   !Weak scaling would set nrl=N and ncl=M
   if (mod(N,nrows)==0 .and. mod(M,ncols)==0) then
       nrl = N/nrows
       ncl = M/ncols
   else
       call MPI_Finalize()
       stop "Number of ranks should divide the number of rows evenly."
   endif

   !create cartesian topology for processes
   ndims=2
   dims(1) = nrows      ! number of rows
   dims(2) = ncols      ! number of columns
   periods(1) = .false. ! not cyclic in this direction
   periods(2) = .false. ! not cyclic in this direction
   reorder    = .false. ! do not allow MPI to reorder the ranks
   call MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods,                 &
                                                     reorder, grid_comm)
   call MPI_Comm_rank(grid_comm, grid_rank)
   call MPI_Cart_coords(grid_comm, grid_rank, ndims, grid_coords)

   direction =  0   ! shift along the 1st index (0 or 1)
   displ =  1       ! shift by  1
   call MPI_Cart_shift(grid_comm, direction, displ, up, down)

   direction =  1   ! shift along the 2nd index (0 or 1)
   call MPI_Cart_shift(grid_comm, direction, displ, left, right)

   ! Set up values
   ! Assume number of ghosts zones same in both dimensions, lower bounds same
   nghosts=1        ! number of ghost zones
   w_halo=2*nghosts ! total halo width in each dimension (including corners)

   nrl_gz=nrl+nghosts   ! first row index past the interior
   ncl_gz=ncl+nghosts   ! first column index past the interior
   nrl_total=nrl+w_halo
   ncl_total=ncl+w_halo

   !Assume lower bound same for both dimensions
   lb=0
   ubr=nrl_total-1
   ubc=ncl_total-1
   allocate(u(lb:ubr,lb:ubc), w(lb:ubr,lb:ubc))

   u=0.d0
   w=0.d0

   !Physical boundary conditions
   call set_bcs(lb,nghosts,nrl,ncl,nrows,ncols,grid_coords,u)

   ! Set up sendrecv/buffers
   nsubdims  = 2
   sizes  = [nrl_total,ncl_total]

   !Rows
   subsizes  = [nghosts,ncl_total]

   ! Start indices must begin at 0 like C regardless of declared lower bound

   starts = [nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                             MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION, sbuf_up)
   call MPI_TYPE_COMMIT(sbuf_up)

   starts = [nrl+nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_down)
   call MPI_TYPE_COMMIT(rbuf_down)

   starts = [nrl,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_down)
   call MPI_TYPE_COMMIT(sbuf_down)

   starts = [0,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION, rbuf_up)
   call MPI_TYPE_COMMIT(rbuf_up)

   !Columns
   subsizes  = [nrl,nghosts]

   starts = [nghosts,ncl]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_right)
   call MPI_TYPE_COMMIT(sbuf_right)

   starts = [nghosts,0]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_left)
   call MPI_TYPE_COMMIT(rbuf_left)

   starts = [nghosts,nghosts]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,sbuf_left)
   call MPI_TYPE_COMMIT(sbuf_left)

   starts = [nghosts,ncl+nghosts]
   call MPI_Type_create_subarray(nsubdims, sizes, subsizes, starts,            &
                              MPI_ORDER_FORTRAN,MPI_DOUBLE_PRECISION,rbuf_right)
   call MPI_TYPE_COMMIT(rbuf_right)

   nrequests=8
   allocate(mpi_requests(nrequests))

   diff_interval=1   ! test convergence every iteration; -1 disables the test

  ! Walltime from process 0 is good enough for me
   if (grid_rank .eq. 0) then
      write (*,'(a,es15.8,a,i4,a,i4)') 'Running until the difference is <= ', &
                                              eps,' for global size ',N,'x',M
     time0=MPI_WTIME()
   endif

   ! Compute steady-state solution
   do while ( iteration<=maxiters )

     ! Exchange halo values

       call MPI_Irecv(u, 1, rbuf_right, right, tag, grid_comm,mpi_requests(1))
       call MPI_Irecv(u, 1, rbuf_left,  left,  tag, grid_comm,mpi_requests(2))
       call MPI_Irecv(u, 1, rbuf_up,    up,    tag, grid_comm,mpi_requests(3))
       call MPI_Irecv(u, 1, rbuf_down,  down,  tag, grid_comm,mpi_requests(4))

       call MPI_Isend(u, 1, sbuf_left,  left,  tag, grid_comm,mpi_requests(5))
       call MPI_Isend(u, 1, sbuf_right, right, tag, grid_comm,mpi_requests(6))
       call MPI_Isend(u, 1, sbuf_down,  down,  tag, grid_comm,mpi_requests(7))
       call MPI_Isend(u, 1, sbuf_up,    up,    tag, grid_comm,mpi_requests(8))

   ! complete communications
       call MPI_Waitall(nrequests,mpi_requests,MPI_STATUSES_IGNORE)

     ! Jacobi sweep over the interior.  i is the row (first) index and j
     ! the column (second); the original bounds had the two extents
     ! swapped, which is wrong (and can index out of bounds) when
     ! nrl /= ncl.  Inner loop over the first index for column-major order.
       do j=lb+nghosts,lb+ncl_gz-1
           do i=lb+nghosts,lb+nrl_gz-1
               w(i,j) = 0.25*(u(i-1,j) + u(i+1,j) + u(i,j-1) + u(i,j+1))
           enddo
       enddo

     ! Convergence test.  The original guarded this with
     ! "if (diff_interval==-1) continue", but CONTINUE is a no-op
     ! statement in Fortran (it is not "cycle"), so the test was never
     ! actually disabled.  Guard explicitly instead.
       if (diff_interval /= -1) then
           if (mod(iteration,diff_interval)==0) then
               ! Interior runs nghosts:ubr-nghosts; the old upper bound
               ! ubr-nghosts-1 skipped the last interior row/column.
               diff=maxval(abs(w(nghosts:ubr-nghosts,nghosts:ubc-nghosts)    &
                   -u(nghosts:ubr-nghosts,nghosts:ubc-nghosts)))
               call MPI_ALLREDUCE(diff,gdiff,1,MPI_DOUBLE_PRECISION,MPI_MAX, &
                                                             grid_comm)

               if (gdiff <= eps) then
                   exit
               endif
           endif
       endif

       !Update u. Don't overwrite boundaries.  Written in terms of
       !nghosts so it stays correct if nghosts is changed from 1.
       u(nghosts:nrl_gz-1,nghosts:ncl_gz-1) = w(nghosts:nrl_gz-1,nghosts:ncl_gz-1)

     ! Reset physical boundaries (they get overwritten in the halo exchange)
       call set_bcs(lb,nghosts,nrl,ncl,nrows,ncols,grid_coords,u)

       iteration = iteration + 1

   enddo

   !Warn if we fell out of the loop without converging.  (The original
   !tested iteration>maxiters *inside* the loop, where the loop condition
   !guarantees it can never be true.)
   if (iteration>maxiters) then
      if (grid_rank==root) then
          write(*,*) "Warning: maximum iterations exceeded"
      endif
   endif

   if (grid_rank==root) then
     time1 = MPI_WTIME()
     write(*,*) 'completed; iterations = ', iteration,"in ",time1-time0," sec"
   endif

  ! Write solution to output file by row. Must use correct stitching to get
  ! overall result, such as a numpy concatenate on axis=1.
  ! It would also work to output by columns, in which case stitching is more
  ! straightforward.

  !Some compilers (Intel) like to add EOLs to line up columns in output. The
  ! Fortran standard permits this with * IO, but it complicates plotting.
  ! Use some F08 features to fix this.

   write(char_row,'(i4)')  grid_coords(1)
   write(char_col,'(i4)')  grid_coords(2)
   fname=trim(filename)//trim(adjustl(char_row))//trim(adjustl(char_col))

   !Write exactly the nrl x ncl interior.  The old loop wrote nrl+1 rows
   !(off by one) and its column slice 1:ncl assumed nghosts==1.
   open (unit=10,file=fname)
   do i=lb+nghosts,lb+nghosts+nrl-1
      write (10,'(*(g0,1x))') u(i,lb+nghosts:lb+nghosts+ncl-1)
   enddo
   close(10)

   !Type_free not strictly necessary at exit, but good practice (and
   !matches the C++ version of this example).
   call MPI_Type_free(sbuf_up)
   call MPI_Type_free(sbuf_down)
   call MPI_Type_free(rbuf_up)
   call MPI_Type_free(rbuf_down)
   call MPI_Type_free(sbuf_left)
   call MPI_Type_free(sbuf_right)
   call MPI_Type_free(rbuf_left)
   call MPI_Type_free(rbuf_right)

   call MPI_Comm_free(grid_comm)

   call MPI_Finalize()

end program


subroutine set_bcs(lb,nghosts,nr,nc,nrows,ncols,coords,u)
  ! Apply fixed physical boundary values to the local array u, including
  ! its ghost layers.  Only ranks on an edge of the process grid write.
  ! This has the ice bath on the bottom edge.
  ! Note: when plotting, 0,0 is the bottom.
  implicit none
  integer, intent(in)                      :: lb ,nghosts, nr, nc, nrows, ncols
  integer, dimension(:), intent(in)        :: coords
  double precision, dimension(lb:,lb:), intent(inout) :: u

  double precision, parameter :: top_temp    = 0.d0
  double precision, parameter :: bottom_temp = 100.d0
  double precision, parameter :: left_temp   = 100.d0
  double precision, parameter :: right_temp  = 100.d0
  integer :: row, col

  ! First grid dimension: top edge of the process grid, else bottom edge
  if (coords(1)==0) then
     do col=lbound(u,2),ubound(u,2)
        do row=lb,lb+nghosts-1
           u(row,col)=top_temp
        enddo
     enddo
  else if (coords(1)==nrows-1) then
     do col=lbound(u,2),ubound(u,2)
        do row=nr+nghosts+lb,ubound(u,1)
           u(row,col)=bottom_temp
        enddo
     enddo
  endif

  ! Second grid dimension: left edge, else right edge.  Applied second,
  ! so the side values win at the corners, as in the original.
  if (coords(2)==0) then
     do col=lb,lb+nghosts-1
        do row=lbound(u,1),ubound(u,1)
           u(row,col)=left_temp
        enddo
     enddo
  else if (coords(2)==ncols-1) then
     do col=nc+nghosts+lb,ubound(u,2)
        do row=lbound(u,1),ubound(u,1)
           u(row,col)=right_temp
        enddo
     enddo
  endif

end subroutine set_bcs

Download mpi_ghostplate.f90 file

Python

Python

Contents of mpi_ghostplate.py

import sys
import time
import numpy as np
from mpi4py import MPI

def set_bcs(u,nrl,ncl,nrows,ncols,coords,nghosts=2):
    """Set fixed physical boundary values on the local array u in place.

    Only ranks on an edge of the process grid are affected.  Because u
    is a mutable numpy array (pass-by-assignment), the caller sees the
    changes, so this acts as a subroutine.

    Ice bath (0) on the top edge; the other three edges are held at 100.

    nghosts is now a parameter (default 2, matching the halo width set
    later in this script); previously the function silently read the
    module-level `nghosts`, which was only defined *after* the function
    and made it fail if called before that point or reused elsewhere.
    """
    topBC=0.
    bottomBC=100.
    leftBC = 100.
    rightBC = 100.

    if coords[0]==0:
        u[0:nghosts,:]=topBC
    if coords[0]==nrows-1:
        u[nrl+nghosts:,:]=bottomBC
    if coords[1]==0:
        u[:,0:nghosts]=leftBC
    if coords[1]==ncols-1:
        u[:,ncl+nghosts:]=rightBC

# check number of parameters and read in parameters
#Better to use argparse but this is closer to compiled language code and simpler
#if one hasn't used argparse.

argv=sys.argv

if  len(argv) > 2:
   try:
      epsilon=float(argv[1])
      filename=argv[2]
   except ValueError:
      # Bug fix: this message (and the ones below) were bare string
      # expression statements, which do nothing; print them instead.
      print("Illegal input")
      sys.exit(1)
   if len(argv) == 3:
       # Default global grid size
       N=500
       M=500
   if len(argv) == 4:
       # One size given: square N x N grid
       try:
          N=int(argv[3])
          M=N
       except ValueError:
           print("Cannot convert grid dimension to integer")
           sys.exit(2)
   if len(argv) == 5:
       # Both dimensions given
       try:
           N=int(argv[3])
           M=int(argv[4])
       except ValueError:
           print("Cannot convert grid dimension to integer")
           sys.exit(2)
else:
   print('USAGE: epsilon output-file <N> <M>')
   sys.exit(1)

#Set up MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

root=0
tag=0

#Set number of rows and columns of the process grid
#For simplicity, require a perfect square number of processes
nrows=np.sqrt(nprocs)
if nrows != int(nrows):
    print("This code requires a perfect square number of processes")
    sys.exit()
else:
    nrows=int(nrows)
    ncols=nrows

#Strong scaling: each rank owns an nrl x ncl subdomain.
#Bug fix: check both dimensions (the old test only checked M%nrows) and
#divide M, not N, by ncols.
if N%nrows!=0 or M%ncols!=0:
    print("Grid dimensions must be evenly divisible by the process grid")
    sys.exit(0)
else:
    nrl = N//nrows
    ncl = M//ncols

#Set max iterations. 10 million should do it.
max_iterations=10000000

#Set up the topology (reorder defaults to False, so grid ranks match comm)
ndims=2
dims=[nrows, ncols]
periods=[False, False]
grid_comm=comm.Create_cart(dims, periods)

grid_rank=grid_comm.Get_rank()
grid_coords=grid_comm.coords

#Neighbor ranks in each direction (MPI.PROC_NULL beyond a non-periodic edge)
direction=0
displ=1
(up,down)=grid_comm.Shift(direction, displ)
direction=1
(left,right)=grid_comm.Shift(direction,displ)

#Assume same number of ghost zones for rows and columns
nghosts=2
w_halo=2*nghosts
nrl_total=nrl+w_halo
ncl_total=ncl+w_halo

#Solution arrays: interior plus nghosts halo layers on every side
u = np.zeros((nrl_total, ncl_total), dtype=np.double)
w = np.zeros((nrl_total, ncl_total), dtype=np.double)

set_bcs(u,nrl,ncl,nrows,ncols,grid_coords)

# set up MPI types for buffers
# Each subarray type describes one rectangular patch of the full local
# array (interior plus halo); sends read the outermost interior layers,
# receives write the ghost layers.

sizes = [nrl_total,ncl_total]

# Rows: patches nghosts rows tall spanning every column (corners included,
# so the corner ghost cells are filled by the up/down exchange)
subsizes  = [nghosts,ncl_total]

# Topmost interior rows, sent up
starts = [nghosts,0]
sbuf_up=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_up.Commit()

# Bottom ghost rows, received from the rank below
starts = [nrl+nghosts,0]
rbuf_down=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_down.Commit()

# Bottommost interior rows, sent down
starts = [nrl,0]
sbuf_down=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_down.Commit()

# Top ghost rows, received from the rank above
starts = [0,0]
rbuf_up=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_up.Commit()

# Columns: patches nghosts columns wide spanning only the interior rows
subsizes  = [nrl,nghosts]

# Rightmost interior columns, sent right
starts = [nghosts,ncl]
sbuf_right=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_right.Commit()

# Left ghost columns, received from the rank to the left
starts = [nghosts,0]
rbuf_left=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_left.Commit()

# Leftmost interior columns, sent left
starts = [nghosts,nghosts]
sbuf_left=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
sbuf_left.Commit()

# Right ghost columns, received from the rank to the right
starts = [nghosts,ncl+nghosts]
rbuf_right=MPI.DOUBLE.Create_subarray(sizes, subsizes, starts)
rbuf_right.Commit()

# Compute steady-state solution by Jacobi iteration
iteration=0
diff_interval=1

start_time=MPI.Wtime()
my_diff = np.zeros(1)
global_diff=np.zeros(1)

#First index past the interior in each dimension
nrl_gz=nrl+nghosts
ncl_gz=ncl+nghosts

if rank==root:
    print(f'Running until the difference is < {epsilon:}, global size {N:}x{M:}')

while iteration<=max_iterations:

    # Exchange halos.  Use the Cartesian communicator whose ranks the
    # Shift calls returned (identical to comm here since reorder=False,
    # and consistent with the C++/Fortran versions).
    rcv_down =grid_comm.Irecv([u, 1, rbuf_down],  down)
    rcv_up   =grid_comm.Irecv([u, 1, rbuf_up],    up)
    rcv_left =grid_comm.Irecv([u, 1, rbuf_left],  left)
    rcv_right=grid_comm.Irecv([u, 1, rbuf_right], right)

    snd_up   =grid_comm.Isend([u, 1, sbuf_up],    up)
    snd_down =grid_comm.Isend([u, 1, sbuf_down],  down)
    snd_right=grid_comm.Isend([u, 1, sbuf_right], right)
    snd_left =grid_comm.Isend([u, 1, sbuf_left],  left)

    requests=[rcv_down,rcv_up,snd_up,snd_down,
              rcv_left,rcv_right,snd_right,snd_left]

    MPI.Request.Waitall(requests)

    # Jacobi sweep over the interior (five-point stencil average)
    w[nghosts:nrl_gz,nghosts:ncl_gz]=0.25*(u[nghosts-1:nrl_gz-1,nghosts:ncl_gz]+
                                           u[nghosts+1:nrl_gz+1,nghosts:ncl_gz]+
                                           u[nghosts:nrl_gz,nghosts-1:ncl_gz-1]+
                                           u[nghosts:nrl_gz,nghosts+1:ncl_gz+1])

    if iteration%diff_interval==0:
        # Bug fix: the row slice of u below used ncl_gz instead of
        # nrl_gz, comparing mismatched regions when nrl != ncl.
        my_diff[0]=np.max(np.abs(w[nghosts:nrl_gz,nghosts:ncl_gz]-
                                 u[nghosts:nrl_gz,nghosts:ncl_gz]))
        grid_comm.Allreduce(my_diff,global_diff,op=MPI.MAX)

        if global_diff[0]<=epsilon:
            break

    #Update u (interior only; boundaries are not overwritten)
    u[nghosts:nrl_gz,nghosts:ncl_gz]=w[nghosts:nrl_gz,nghosts:ncl_gz]

    #Reapply physical boundary conditions (overwritten by the halo exchange)
    set_bcs(u,nrl,ncl,nrows,ncols,grid_coords)

    iteration+=1

#This is what the weird "else" clause in Python for/while loops is used to do.
#Our stopping criterion is now on exceeding max_iterations so don't need
# to break, but we need to warn that it happened.
else:
     if grid_rank==0:
         print("Warning: maximum iterations exceeded")

total_time=MPI.Wtime()-start_time

if grid_rank==0:
    print(f'completed in {iteration:} iterations with time {total_time:.2f}')

# Write exactly the nrl x ncl interior to the output file.
# Bug fix: the old slices (1:nrl+1, 1:ncl+1) assumed nghosts==1, so with
# nghosts=2 they wrote a ghost row/column and dropped an interior one.
filename = filename + str(grid_coords[0]) + str(grid_coords[1])
fout = open (filename,'w')
for i in range(nghosts,nrl+nghosts):
    line=" ".join(map(str,list(u[i,nghosts:ncl+nghosts])))
    row=line+"\n"
    fout.write(row)
fout.close()

# All done!
#print(f"wrote output file {filename:}")

Download mpi_ghostplate.py file

Previous
RC Logo RC Logo © 2026 The Rector and Visitors of the University of Virginia