MPI Scatter with std::vector


For simplicity I've removed custom functions, etc. Compiled and run with:

mpic++ main.cpp && mpiexec -np 4 ./a.out

My aim is to divide a matrix dynamically according to the number of ranks. For example, we have the matrix:

std::vector<std::vector<int>> A = {
      {0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}};

With a group size of 2, the matrix should be divided as follows (see the sketch after this list):

  • Rank 0: Local A = ((0,0,0,0), (1,1,1,1))
  • Rank 1: Local A = ((0,0,0,0), (1,1,1,1))
  • Rank 2: Local A = ((2,2,2,2), (3,3,3,3))
  • Rank 3: Local A = ((2,2,2,2), (3,3,3,3))
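
A minimal sketch (plain C++, no MPI calls; it assumes 4 processes and the rank % 2 grouping used in the code below) of which rows each world rank should receive from the per-group MPI_Scatter:

#include <iostream>

int main() {
  const int world_size = 4;         // assumed number of MPI processes
  const int n = 4;                  // 4x4 matrix
  const int rows_per_proc = n / 2;  // 2 rows per process

  for (int rank = 0; rank < world_size; ++rank) {
    int color = rank % 2;       // MPI_Comm_split color: group 0 = {0, 2}, group 1 = {1, 3}
    int group_rank = rank / 2;  // rank inside the group communicator (key = world rank, 4 processes)
    int first_row = group_rank * rows_per_proc;  // MPI_Scatter hands chunk group_rank to this rank
    std::cout << "world rank " << rank << " -> group " << color
              << ", group rank " << group_rank << ", rows " << first_row << "-"
              << first_row + rows_per_proc - 1 << "\n";
  }
  return 0;
}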

My code:

#include <mpi.h>

#include <iostream>
#include <string>
#include <vector>

void print_matrix(const std::vector<std::vector<int>>& mat,
                  const std::string& name, int rank) {
  std::cout << "Process " << rank << " received matrix " << name
            << " with dimensions: " << mat.size() << "x"
            << (mat.empty() ? 0 : mat[0].size()) << std::endl;
  std::cout << "Process " << rank << " printing " << name << ":" << std::endl;
  for (const auto& row : mat) {
    for (int val : row) {
      std::cout << val << " ";
    }
    std::cout << std::endl;
  }
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int n = 4;                  // Assuming a 4x4 matrix
  int rows_per_proc = n / 2;  // Each process handles half the rows

  MPI_Comm group_comm;
  int group = rank % 2;
  MPI_Comm_split(MPI_COMM_WORLD, group, rank, &group_comm);

  int new_rank;
  MPI_Comm_rank(group_comm, &new_rank);

  std::vector<std::vector<int>> A = {
      {0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}};
  std::vector<std::vector<int>> localA(rows_per_proc, std::vector<int>(n));

  // Create a datatype for a block of rows
  MPI_Datatype matrix_block;
  MPI_Type_vector(rows_per_proc, n, n, MPI_INT, &matrix_block);
  MPI_Type_commit(&matrix_block);

  int root = 0;

  // Use MPI_Scatter with the created datatype
  if (new_rank == root) {
    MPI_Scatter(&A[0][0], 1, matrix_block, &localA[0][0], 1, matrix_block, root,
                group_comm);
  } else {
    MPI_Scatter(nullptr, 0, matrix_block, &localA[0][0], 1, matrix_block, root,
                group_comm);
  }

  print_matrix(localA, "Matrix A", rank);

  // Free the custom datatype
  MPI_Type_free(&matrix_block);

  MPI_Finalize();
  return 0;
}

This code's output is:

Process 0 received matrix Matrix A with dimensions: 2x4
Process 0 printing Matrix A:
0 0 0 0 
0 0 0 0 
Process 1 received matrix Matrix A with dimensions: 2x4
Process 1 printing Matrix A:
0 0 0 0 
0 0 0 0 
Process 2 received matrix Matrix A with dimensions: 2x4
Process 2 printing Matrix A:
1 1 1 1 
0 0 0 0 
Process 3 received matrix Matrix A with dimensions: 2x4
Process 3 printing Matrix A:
1 1 1 1 
0 0 0 0 

The output I'm aiming for, per process, is:

Process 0    Process 1    Process 2    Process 3
0,0,0,0      0,0,0,0      2,2,2,2      2,2,2,2
1,1,1,1      1,1,1,1      3,3,3,3      3,3,3,3

Solution

  • Thanks to @user4581301 and Doug from this post, I've altered his class a bit to print some info and solved the problem with a "fake 2D" flat 1D vector. A std::vector<std::vector<int>> stores each row in its own heap allocation, so &A[0][0] is not one contiguous 4x4 block that MPI_Scatter can read from; a single std::vector<int> indexed as 2D is. (An alternative sketch that flattens the nested vector directly follows the output below.)

    #include <mpi.h>

    #include <iostream>
    #include <string>
    #include <vector>

    #include "utils.cpp"
    
    // Flat, contiguous row-major storage indexed like a 2D array:
    // element (r, c) lives at v[r * nc + c], so v.data() is a valid MPI buffer.
    class Array2D {
     public:
      std::vector<int> v;  // one contiguous block of NR * NC ints
      int nc;              // number of columns
      Array2D(int NR, int NC) : v(NR * NC), nc(NC) {}

      // operator[] returns a pointer to the start of row r, so mat[r][c] works.
      int* operator[](int r) { return &v[r * nc]; }
      const int* operator[](int r) const { return &v[r * nc]; }
    };
    
    void print_matrix(const Array2D& mat, int rows, int cols,
                      const std::string& name, int rank) {
      std::cout << "Process " << rank << " received matrix " << name
                << " with dimensions: " << rows << "x" << cols << std::endl;
      std::cout << "Process " << rank << " printing " << name << ":" << std::endl;
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          std::cout << mat[i][j] << " ";
        }
        std::cout << std::endl;
      }
    }
    
    /*
                            Matrix Distribution Diagram:
      +---------+---------+       +-----------------+       +--------+--------+
      |         |         |       |                 |       |        |        |
      |   P0    |   P1    |  <-   |       A0        |   *   |        |        |
      |         |         |       |                 |       |        |        |
      +---------+---------+       +-----------------|       |   B0   |   B1   |
      |         |         |       |       A1        |       |        |        |
      |   P2    |   P3    |       |                 |       |        |        |
      |         |         |       +-----------------+       +--------+--------+
      +---------+---------+
          Matrix C
    */
    
    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);
    
      // ! Read Dynamically
      int n = 4;
      int rows_per_proc = n / 2;  // Half rows per process for A
    
      // ? Split MPI_COMM_WORLD into group communicators (this example runs 4 processes)
      MPI_Comm group_comm;
      int group = rank % 2;  // Group 0 for ranks 0 and 2, Group 1 for ranks 1 and 3
      MPI_Comm_split(MPI_COMM_WORLD, group, rank, &group_comm);
    
      int new_rank;
      MPI_Comm_rank(group_comm, &new_rank);  // Get rank in new communicator
    
      // ! Read from file
      Array2D A(4, 4);  // Full matrix
      // Initialize matrix A
      for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
          A[i][j] = i;
        }
      }
    
      Array2D localA(rows_per_proc, n);  // Local submatrix
    
      // Calculate the offset based on the group
      int offset = (group == 0) ? 0 : rows_per_proc * n;
    
      // Determine the root for each group
      int root = 0;  // Root is always rank 0 in the new communicator
    
      // Scatter the matrix A
      if (new_rank == root) {
        std::cout << "Scattering A from root " << root << " in group " << group
                  << std::endl;
        MPI_Scatter(A.v.data(), rows_per_proc * n, MPI_INT, localA.v.data(),
                    rows_per_proc * n, MPI_INT, root, group_comm);
      } else {
        MPI_Scatter(nullptr, 0, MPI_INT, localA.v.data(), rows_per_proc * n,
                    MPI_INT, root, group_comm);
      }
    
      // Print the received submatrix
      print_matrix(localA, rows_per_proc, n, "Matrix A", rank);
    
      MPI_Finalize();
      return 0;
    }
    

    This code gives the preferred output:

    Scattering A from root 0 in group 1
    Process 1 received matrix Matrix A with dimensions: 2x4
    Process 1 printing Matrix A:
    0 0 0 0 
    1 1 1 1 
    Process 2 received matrix Matrix A with dimensions: 2x4
    Process 2 printing Matrix A:
    2 2 2 2 
    3 3 3 3 
    Process 3 received matrix Matrix A with dimensions: 2x4
    Process 3 printing Matrix A:
    2 2 2 2 
    3 3 3 3 
    Scattering A from root 0 in group 0
    Process 0 received matrix Matrix A with dimensions: 2x4
    Process 0 printing Matrix A:
    0 0 0 0 
    1 1 1 1
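
    For completeness, a minimal alternative sketch that keeps the nested-vector input and simply copies it into one contiguous buffer before scattering. The flatten helper below is illustrative, not part of the original code; MPI_Scatter ignores its send arguments on non-root ranks, so a single call serves every process:

    #include <mpi.h>

    #include <iostream>
    #include <vector>

    // Illustrative helper: copy a nested vector into one contiguous buffer.
    std::vector<int> flatten(const std::vector<std::vector<int>>& m) {
      std::vector<int> flat;
      for (const auto& row : m) flat.insert(flat.end(), row.begin(), row.end());
      return flat;
    }

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      const int n = 4;
      const int rows_per_proc = n / 2;

      // Same grouping as above: group 0 = ranks {0, 2}, group 1 = ranks {1, 3}.
      MPI_Comm group_comm;
      MPI_Comm_split(MPI_COMM_WORLD, rank % 2, rank, &group_comm);

      std::vector<std::vector<int>> A = {
          {0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}};
      std::vector<int> flatA = flatten(A);         // contiguous copy of A
      std::vector<int> localA(rows_per_proc * n);  // contiguous receive buffer

      // Send arguments are only read on each group's root (group rank 0).
      MPI_Scatter(flatA.data(), rows_per_proc * n, MPI_INT, localA.data(),
                  rows_per_proc * n, MPI_INT, 0, group_comm);

      std::cout << "Rank " << rank << " got:";
      for (int v : localA) std::cout << " " << v;
      std::cout << std::endl;

      MPI_Comm_free(&group_comm);
      MPI_Finalize();
      return 0;
    }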