c++performance parallel-processing mpi hpc

Using MPI_Send and MPI_Recv instead of MPI_Gather

I want to compare the performance of MPI_Send and MPI_Recv against MPI_Gather, so I'm trying to produce the same result with this code, which does not use MPI_Gather. The problem is that the root's buffer does not seem to be updated correctly. The code is:

#include <fstream>
#include <iostream>
#include <vector>
#include <string>
#include "mpi.h"
using namespace std;
// Number of ints each rank contributes (size of every per-rank message).
const int N = 2;

// Hand-rolled gather: every non-root rank sends its N ints to rank 0 with
// MPI_Send, and rank 0 collects them with one MPI_Recv per sender, then
// prints the combined array.
int main() {
    MPI_Init(NULL, NULL);
    int rank;
    int size;
    int root = 0;
    vector<int> x(N);             // this rank's contribution
    vector<int> receive_data(N);  // never used below
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int leng = size * N;
    vector<int> single_arr(leng);  // room for N ints from each of the `size` ranks; zero-initialized

    // Fill the local data: rank r holds {r, r + 1, ..., r + N - 1}.
    for (int i = 0; i < N;i++) {
        x[i] = rank + i;
    }

    // The root places its own contribution in the first N entries.
    if (rank == 0) {
        for (int i = 0; i < N; i++) {
            single_arr[i] = x[i];
        }
    }

    // Every other rank sends its N ints to rank 0 with tag 0.
    if (rank != root) {
        MPI_Send(x.data(), N, MPI_INT, 0, 0, MPI_COMM_WORLD);
    }

    // The root receives from ranks 1 .. size - 1 in order (rank + i == i here
    // because rank == root == 0).
    if (rank == root) {
        for (int i = 1; i < size; i++) {
            // NOTE(review): the receive buffer is always single_arr.data(), i.e.
            // offset 0, so each message is written over the previous one (and
            // over the root's own values) instead of into its own N-int slot.
            MPI_Recv(single_arr.data(), N, MPI_INT, rank + i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
    }
    // The root prints every element as "<index>\t<value>".
    if (rank == root) {
        for (int i = 0; i < single_arr.size();i++) {
            cout << i << "\t" << single_arr[i] << endl;
        }
    }
    MPI_Finalize();
}

Its result (the values of single_arr in index order, e.g. when run with 4 processes) is: 3 4 0 0 0 0 0 0

But I want this one (the values of single_arr in index order, e.g. when run with 4 processes): 0 1 1 2 2 3 3 4

Is there any way to do this?

Solution

The problem is that in:

// Non-root ranks: send the N ints of x to rank 0 (tag 0).
if (rank != root) {
    MPI_Send(x.data(), N, MPI_INT, 0, 0, MPI_COMM_WORLD);
}

every non-root process sends the N elements of its x array to process 0.

Then process 0 :

// Root: one receive per sender, but every receive writes to
// single_arr.data(), i.e. the same first N entries.
if (rank == root) {
    for (int i = 1; i < size; i++) {
        MPI_Recv(single_arr.data(), N, MPI_INT, rank + i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
}

will receive the data from each process. However, every receive writes to the beginning of single_arr, so each message overwrites the previous one instead of being appended after it.

And that is why in

// Root: print every element of single_arr as "<index>\t<value>".
if (rank == root) {
        for (int i = 0; i < single_arr.size();i++) {
            cout << i << "\t" << single_arr[i] << endl;
        }
    }

you print the values of the last process (i.e., 3 and 4 when running with 4 processes, since x[i] = rank + i and the last rank is 3) followed by zeros.

To fix it you have to do the following:

// Root: receive the message from rank i into its own N-int slot, which
// starts at element N * i of single_arr, so no message overwrites another.
if (rank == root) {
    for (int i = 1; i < size; i++) {
        MPI_Recv(&(single_arr.data()[N * i]), N, MPI_INT, rank + i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
}

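This receives the data from process i into single_arr starting at position N * i and ending just before position N * i + N (i.e., elements N * i through N * i + N - 1), so each process gets its own slot.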