Tags: c, performance, parallel-processing, mpi, matrix-multiplication

MPI: Sending and receiving a dynamically allocated sub-matrix



I have a problem with sending a dynamically allocated sub-matrix to the workers. I can't understand how to do this correctly (and what exactly I should send).

Here is the sending part:

MPI_Send(&(a[offset][0]), rows * NCA, MPI_DOUBLE, dest, FROM_MASTER + 2, MPI_COMM_WORLD);
MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

Here is the receiving part:

MPI_Recv(&(a[0][0]), rows * NCA, MPI_DOUBLE, MASTER, FROM_MASTER + 2, MPI_COMM_WORLD, &status);
MPI_Recv(&(b[0][0]), NCA * NCB, MPI_DOUBLE, MASTER, FROM_MASTER + 3, MPI_COMM_WORLD, &status);

Here is the error message:

[pop-os:29368] Read -1, expected 80000, errno = 14
[pop-os:29367] *** Process received signal ***
[pop-os:29367] Signal: Segmentation fault (11)
[pop-os:29367] Signal code: Address not mapped (1)
[pop-os:29367] Failing at address: 0x7fffc2ae8000

Here is the full code:

#include <cstdio>
#include <cstdlib>

#include "mpi.h"
#define NRA 100        /* number of rows in matrix A */
#define NCA 100        /* number of columns in matrix A */
#define NCB 100        /* number of columns in matrix B */
#define MASTER 0       /* taskid of first task */
#define FROM_MASTER 1  /* setting a message type */
#define FROM_WORKER 10 /* setting a message type */

double **alloc_2d_int(int rows, int cols) {
  double **array= (double **)malloc(rows*sizeof(double*));
  for (int i=0; i<rows; i++)
    array[i] = (double *)malloc(rows*cols*sizeof(double));
  return array;
}


int main(int argc, char *argv[]) {
  int numtasks, taskid, numworkers, source, dest, rows,
      /* rows of matrix A sent to each worker */
      averow, extra, offset, i, j, k;

  double **a = alloc_2d_int(NRA, NCA);
  double **b = alloc_2d_int(NCA, NCB);
  double **c = alloc_2d_int(NRA, NCB);

  MPI_Init(&argc, &argv);

  MPI_Status status;

  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

  if (numtasks < 2) {
    printf("Need at least two MPI tasks. Quitting...\n");
    MPI_Abort(MPI_COMM_WORLD, -1);
    exit(1);
  }

  numworkers = numtasks - 1;

  if (taskid == MASTER) {
    printf("mpi_mm has started with %d tasks (task1).\n", numtasks);

    for (i = 0; i < NRA; i++)
      for (j = 0; j < NCA; j++) a[i][j] = 10;
    for (i = 0; i < NCA; i++)
      for (j = 0; j < NCB; j++) b[i][j] = 10;

    double t1 = MPI_Wtime();

    averow = NRA / numworkers;
    extra = NRA % numworkers;
    offset = 0;

    for (dest = 1; dest <= numworkers; dest++) {
      rows = (dest <= extra) ? averow + 1 : averow;
      printf("Sending %d rows to task %d offset=%d\n", rows, dest, offset);

      MPI_Send(&offset, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, dest, FROM_MASTER + 1, MPI_COMM_WORLD);
      MPI_Send(&(a[offset][0]), rows * NCA, MPI_DOUBLE, dest, FROM_MASTER + 2,
               MPI_COMM_WORLD);
      MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3,
               MPI_COMM_WORLD);

      offset = offset + rows;
    }

    /* Receive results from worker tasks */
    for (source = 1; source <= numworkers; source++) {
      MPI_Recv(&offset, 1, MPI_INT, source, FROM_WORKER, MPI_COMM_WORLD,
               &status);
      MPI_Recv(&rows, 1, MPI_INT, source, FROM_WORKER + 1, MPI_COMM_WORLD,
               &status);
      MPI_Recv(&(c[offset][0]), rows * NCB, MPI_DOUBLE, source, FROM_WORKER + 2,
               MPI_COMM_WORLD, &status);

      printf("Received results from task %d\n", source);
    }

    /* Print results */
    /*
    printf("****\n");
    printf("Result Matrix:\n");

    for (i = 0; i < NRA; i++)
     {
      printf("\n");
      for (j = 0; j < NCB; j++) printf("%6.2f ", c[i][j]);
    }*/

    printf("\n********\n");
    printf("Done.\n");

    t1 = MPI_Wtime() - t1;

    printf("\nExecution time: %.2f\n", t1);
  }
  /******** worker task *****************/
  else { /* if (taskid > MASTER) */
    MPI_Recv(&offset, 1, MPI_INT, MASTER, FROM_MASTER, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, MASTER, FROM_MASTER + 1, MPI_COMM_WORLD,
             &status);
    MPI_Recv(&(a[0][0]), rows * NCA, MPI_DOUBLE, MASTER, FROM_MASTER + 2,
             MPI_COMM_WORLD, &status);
    MPI_Recv(&(b[0][0]), NCA * NCB, MPI_DOUBLE, MASTER, FROM_MASTER + 3, MPI_COMM_WORLD,
             &status);

    for (k = 0; k < NCB; k++)
      for (i = 0; i < rows; i++) {
        c[i][k] = 0.0;
        for (j = 0; j < NCA; j++) c[i][k] = c[i][k] + a[i][j] * b[j][k];
      }

    MPI_Send(&offset, 1, MPI_INT, MASTER, FROM_WORKER, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, MASTER, FROM_WORKER + 1, MPI_COMM_WORLD);
    MPI_Send(&c, rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2,
             MPI_COMM_WORLD);
  }

  for (i=0; i<NRA; i++)
    free(a[i]);
  free(a);

  for (i=0; i<NCA; i++)
    free(b[i]);
  free(b);

  for (i=0; i<NRA; i++)
    free(c[i]);
  free(c);

  MPI_Finalize();
}

Solution: link to GitHub with the correct code


Solution

  • The first parameter of MPI_Send (const void *) and MPI_Recv (void *) is the address of the data buffer. &b has type double *** and points at the local pointer variable b itself, not at the matrix data, so you need to change:

    MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

    to

    MPI_Send(&(b[0][0]), NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

    and

    MPI_Send(&c, rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2, MPI_COMM_WORLD);

    to

    MPI_Send(&(c[0][0]), rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2, MPI_COMM_WORLD);

    This matches the &(a[0][0]) style already used on the receive side. Note that these addresses only describe a usable buffer once the matrix storage is contiguous, which is the second issue below.

  • Another issue is that you are allocating an array of pointers:

    double **alloc_2d_int(int rows, int cols) {
      double **array= (double **)malloc(rows*sizeof(double*));
      for (int i=0; i<rows; i++)
        array[i] = (double *)malloc(rows*cols*sizeof(double));
      return array;
    }

    Each row comes from its own malloc, so consecutive rows are not adjacent in memory (as an aside, each row also over-allocates rows*cols doubles where cols would be enough). But the data passed to MPI_Send and MPI_Recv is assumed to be contiguous. To solve this, you can allocate a contiguous 2D array, represent the matrix as a flat 1D array, or create a custom MPI datatype, among other options; the first two are sketched below.
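
    For the contiguous 2D array option, a minimal sketch looks like the following (the helper names alloc_2d_double and free_2d_double are illustrative, not taken from the linked solution). One malloc holds all the elements and a second one holds row pointers into that block, so a[i][j] indexing keeps working while &(a[0][0]) addresses rows*cols consecutive doubles:

    #include <stdlib.h>

    double **alloc_2d_double(int rows, int cols) {
      /* one contiguous block for all elements */
      double *data = (double *)malloc((size_t)rows * cols * sizeof(double));
      /* row-pointer table indexing into that block */
      double **array = (double **)malloc(rows * sizeof(double *));
      for (int i = 0; i < rows; i++)
        array[i] = data + (size_t)i * cols;
      return array;
    }

    void free_2d_double(double **array) {
      free(array[0]); /* frees the whole data block */
      free(array);    /* frees the row-pointer table */
    }

    With this allocator, MPI_Send(&(a[offset][0]), rows * NCA, MPI_DOUBLE, ...) really does describe rows * NCA consecutive doubles starting at row offset. Note that the per-row free loops at the end of main would then double-free; each matrix should instead be released with a single free_2d_double call.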
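
    Alternatively, with the flat 1D representation the matrix is a plain double * and element (i, j) is stored at index i * NCA + j. Here is a sketch of how the master side of the question's code might look with this layout (again an illustration, not the code behind the GitHub link):

    /* the whole matrix is one malloc'ed block, so any row range is
     * a contiguous span of doubles */
    double *a = (double *)malloc(NRA * NCA * sizeof(double));
    double *b = (double *)malloc(NCA * NCB * sizeof(double));

    /* element (i, j) of a lives at a[i * NCA + j] */
    for (i = 0; i < NRA; i++)
      for (j = 0; j < NCA; j++)
        a[i * NCA + j] = 10;

    /* rows offset .. offset + rows - 1 start at a + offset * NCA */
    MPI_Send(&a[offset * NCA], rows * NCA, MPI_DOUBLE, dest, FROM_MASTER + 2,
             MPI_COMM_WORLD);
    /* b itself is now a valid buffer argument */
    MPI_Send(b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

    The worker's multiplication then reads b[j * NCB + k] instead of b[j][k]: the index arithmetic becomes explicit, but there is no pointer table to keep consistent and a single free per matrix suffices.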