I have an MPI code that implements 2D domain decomposition to compute numerical solutions to a PDE. Currently I write each distributed 2D array out to a separate binary file per process (e.g. array_x --> proc000x.bin). I want to reduce that to a single binary file.
array_0, array_1,
array_2, array_3,
Suppose the above illustrates a Cartesian topology with 4 processes (2x2). Each 2D array has dimension (nx + 2, ny + 2), where the +2 signifies "ghost" layers added on all sides for communication purposes.
I would like to extract the interior arrays (omitting the ghost layers) and write them to a single binary file in an order something like,
array_0, array_1, array_2, array_3 --> output.bin
If possible, it would be preferable to write it as though I had access to the global grid and were writing it row by row, i.e.,
row 0 of array_0, row 0 of array_1, row 1 of array_0, row 1 of array_1, ...
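(For the 2x2 toy example below, that second format is simply the row-major flattening of the 10x10 global interior grid, so the value at global row i, column j would end up at byte offset (i*10 + j)*sizeof(float) in output.bin.)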
The code below (array_test.c) attempts the first of those two output formats.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));

    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {
    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */
    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2};      /* 2x2 Cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish Cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get Cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};

    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx - npad; i++) {
        for (j = npad; j < ny - npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* MPI IO */
    MPI_File fh;
    MPI_Status status;
    char file_name[100];
    int N, offset;

    sprintf(file_name, "output.bin");
    MPI_File_open(grid_Comm, file_name, MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);
    N = (nx - 2 * npad) * (ny - 2 * npad);
    offset = (row * 2 + col) * N * sizeof(float);
    MPI_File_set_view(fh, offset, MPI_FLOAT, grid, "native",
                      MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], N, MPI_FLOAT, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&grid);
    MPI_Finalize();

    return 0;
}
Compiles with
mpicc -o array_test array_test.c
Runs with
mpiexec -n 4 array_test
While the code compiles and runs, the output is incorrect. I'm assuming that I have misinterpreted the use of the derived datatype and file writing in this case. I'd appreciate some help figuring out my mistakes.
The error you make here is that you have the wrong file view. Instead of creating a type representing the share of the file the current process is responsible for, you use the mask corresponding to the local data you want to write.
You actually have two very distinct masks to consider: the mask for the local data you want to write, and the mask for the file, i.e. the share of the global array that the current process is responsible for.
The former corresponds to the layout of the local array: the data that you want to output to the file for a given process is the interior of the array, and the surrounding halo layer is what should not be written to the file.
The latter corresponds to the layout of the file itself: each process contributes its local data as one block of the global array, placed according to its position on the 2D Cartesian grid.
To understand what you need to create to reach this final result, you have to think backwards:
1. Your final call to the writing routine should be MPI_File_write_all(fh, &A[0][0], 1, interior, MPI_STATUS_IGNORE);. So your interior type has to be defined such as to exclude the halo boundary. Fortunately, the type grid you created already does exactly that, so we will use it.
2. But you now also need the view on the file to allow for this MPI_File_write_all() call. The view must describe the second layout above, so we will create a new MPI type representing it. For that, MPI_Type_create_subarray() is what we need.
Here is the synopsis of this function:
int MPI_Type_create_subarray(int ndims,
                             const int array_of_sizes[],
                             const int array_of_subsizes[],
                             const int array_of_starts[],
                             int order,
                             MPI_Datatype oldtype,
                             MPI_Datatype *newtype)

Create a datatype for a subarray of a regular, multidimensional array

INPUT PARAMETERS
    ndims             - number of array dimensions (positive integer)
    array_of_sizes    - number of elements of type oldtype in each dimension
                        of the full array (array of positive integers)
    array_of_subsizes - number of elements of type oldtype in each dimension
                        of the subarray (array of positive integers)
    array_of_starts   - starting coordinates of the subarray in each dimension
                        (array of nonnegative integers)
    order             - array storage order flag (state)
    oldtype           - array element datatype (handle)

OUTPUT PARAMETERS
    newtype           - new datatype (handle)
For our 2D Cartesian file view, here is what we need for these input parameters (with nnx = nx - 2*npad and nny = ny - 2*npad the interior dimensions):
ndims: 2, as the grid is 2D
array_of_sizes: the dimensions of the global array to output, namely { nnx*dim[0], nny*dim[1] }
array_of_subsizes: the dimensions of the local share of the data to output, namely { nnx, nny }
array_of_starts: the start coordinates of the local share within the global grid, namely { nnx*coord[0], nny*coord[1] }
order: the ordering is C, so this must be MPI_ORDER_C
oldtype: the data are floats, so this must be MPI_FLOAT
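In code, this is exactly the view type that is created and committed in the full listing below:

int nnx = nx - 2 * npad, nny = ny - 2 * npad;
int startV[2]    = { coord[0] * nnx, coord[1] * nny };
int arrsizeV[2]  = { dim[0] * nnx, dim[1] * nny };
int gridsizeV[2] = { nnx, nny };

MPI_Type_create_subarray(2, arrsizeV, gridsizeV,
                         startV, MPI_ORDER_C, MPI_FLOAT, &view);
MPI_Type_commit(&view);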
Now that we have our type for the file view, we simply apply it with MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL);
and the magic is done.
Your full code becomes:
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));

    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {
    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */
    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2};      /* 2x2 Cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish Cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get Cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};

    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx - npad; i++) {
        for (j = npad; j < ny - npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* Create derived type for file view */
    MPI_Datatype view;
    int nnx = nx - 2 * npad, nny = ny - 2 * npad;
    int startV[2]    = { coord[0] * nnx, coord[1] * nny };
    int arrsizeV[2]  = { dim[0] * nnx, dim[1] * nny };
    int gridsizeV[2] = { nnx, nny };

    MPI_Type_create_subarray(2, arrsizeV, gridsizeV,
                             startV, MPI_ORDER_C, MPI_FLOAT, &view);
    MPI_Type_commit(&view);

    /* MPI IO */
    MPI_File fh;

    MPI_File_open(grid_Comm, "output.bin", MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], 1, grid, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&view);
    MPI_Type_free(&grid);
    MPI_Finalize();

    return 0;
}
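To convince yourself that the ordering is right, a minimal serial reader (say check_output.c, not part of the original code) can reassemble and print the file, assuming the 2x2 grid of 5x5 interior blocks used in this toy example, i.e. a 10x10 global array of floats in row-major order:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    /* Global interior dimensions, assuming the 2x2 grid of 5x5 blocks above:
     * dim[0]*nnx = 10 rows, dim[1]*nny = 10 columns */
    const int gnx = 10, gny = 10;
    float *buf = malloc(gnx * gny * sizeof(float));
    FILE *fp = fopen("output.bin", "rb");

    if (fp == NULL || fread(buf, sizeof(float), gnx * gny, fp) != (size_t)(gnx * gny)) {
        fprintf(stderr, "failed to read output.bin\n");
        return 1;
    }
    fclose(fp);

    /* Print the reassembled global grid row by row */
    for (int i = 0; i < gnx; i++) {
        for (int j = 0; j < gny; j++)
            printf("%5.1f ", buf[i * gny + j]);
        printf("\n");
    }

    free(buf);
    return 0;
}

Compile it with a plain C compiler and run it after mpiexec has produced output.bin; every printed row should contain values coming from the two processes that share that row of the Cartesian grid.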