MPI Reduce and Broadcast work, but cause a future return value to fail


I am using MPI to implement Dijkstra's algorithm for a class. My teacher also has no idea why this is broken and has given me permission to post here.

My problem is happening in the chooseVertex function. The program works fine with 1 processor, but when I run it with 2 processors, processor 0 fails to return leastPosition, even though I am able to print the contents of leastPosition on the line before the return. My code:

#include "mpi.h"
#include <stdlib.h>
#include <stdio.h>

#define min(x,y) ((x) > (y) ? (y) : (x))
#define MASTER 0
#define INFINTY 100000

void dijkstra(int, int, int **, int *, int, int);
int chooseVertex(int *, int, int *, int, int);

int main(int argc, char* argv[])
{
int rank, size, i, j;
//Initialize MPI
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

//Initialize graph
int src = 0;
int n = 12;
int **edge = (int**) malloc(n * sizeof(int *));
for (i = 0; i < n; i++)
    edge[i] = (int *)malloc(n * sizeof(int));
int dist[12];

//Set all graph lengths to infinity
for (i = 0; i < n; i++)
{
    for (j = 0; j < n; j++)
    {
        if (i == j) { edge[i][j] = 0; }
        else { edge[i][j] = INFINTY; }
    }
}
//set graph edge lengths
edge[0][3] = 5;
edge[0][6] = 13;
edge[1][5] = 12;
edge[2][1] = 7;
edge[3][2] = 9;
edge[3][4] = 2;
edge[4][7] = 3;
edge[5][10] = 1;
edge[5][11] = 4;
edge[6][9] = 9;
edge[7][8] = 4;
edge[8][9] = 10;
edge[8][10] = 7;
edge[9][10] = 6;
edge[10][11] = 1;

dijkstra(src, n, edge, dist, rank, size);

if(rank == MASTER){ printf("The distance is %d", dist[n - 1]); }

MPI_Finalize();
return 0;
}

//called by dijkstras function below
int chooseVertex(int *dist, int n, int *found, int rank, int size) {
int i, tmp, partition, lower, upper, leastPosition;
int least = INFINTY;

//set the number of nodes each processor will work with
partition = n / size;
lower = rank * partition;
upper = lower + partition;

//used for MPI_Reduce
struct {
    int pos;
    int val;
} sendBuffr, recvBuffr;

//calculate least position
for (i = lower; i < upper; i++) {
    tmp = dist[i];
    if ((!found[i]) && (tmp < least)) {
        least = tmp;
        leastPosition = i;
    }
}
//if all nodes checked are INFINITY, go with last node checked
if (least == INFINTY) leastPosition = i;

//set the send buffer for MPI_Reduce
sendBuffr.val = least;
sendBuffr.pos = leastPosition;

//Rank 0 processor has correct least position and value
MPI_Reduce(&sendBuffr, &recvBuffr, 1, MPI_DOUBLE_INT, MPI_MINLOC, MASTER, MPI_COMM_WORLD);


if (rank == MASTER) leastPosition = recvBuffr.pos;

//Update all processors to have correct position
MPI_Bcast(&leastPosition, 1, MPI_INT, MASTER, MPI_COMM_WORLD);

//Print the contents of leastPosition on rank 0 for debugging
if(rank == MASTER) printf("LeastPosition for rank %d is: %d\n", rank, leastPosition);
fflush(stdout);

return leastPosition;
}

void dijkstra(int SOURCE, int n, int **edge, int *dist, int rank, int size)          
{
int i, j, count, partition, lower, upper, *found, *sendBuffer;

j = INFINTY;
sendBuffer = (int *)malloc(n * sizeof(int));
found = (int *)calloc(n, sizeof(int));

partition = n / size;
lower = rank * partition;
upper = lower + partition;

//set the distance array
for (i = 0; i < n; i++) {
    found[i] = 0;
    dist[i] = edge[SOURCE][i];
    sendBuffer[i] = dist[i];
}

found[SOURCE] = 1;
count = 1;

//Dijkstra loop
while (count < n) {
    printf("before ChooseVertex:  rank %d reporting\n", rank);
    fflush(stdout);
    j = chooseVertex(dist, n, found, rank, size);
    printf("after ChooseVertex:   rank %d reporting\n", rank);
    fflush(stdout);
    count++;

    found[j] = 1;
    for (i = lower; i < upper; i++) {
        if (!found[i])
        {
            dist[i] = min(dist[i], dist[j] + edge[j][i]);
            sendBuffer[i] = dist[i];
        }
    }
    MPI_Reduce(sendBuffer, dist, n, MPI_INT, MPI_MIN, MASTER, MPI_COMM_WORLD);
    MPI_Bcast(dist, n, MPI_INT, MASTER, MPI_COMM_WORLD);
}
}

Sample error messages:

before ChooseVertex:  rank 1 reporting
before ChooseVertex:  rank 0 reporting
LeastPosition for rank 0 is: 3
after ChooseVertex:   rank 1 reporting
after ChooseVertex:   rank 0 reporting
before ChooseVertex:  rank 1 reporting
before ChooseVertex:  rank 0 reporting
after ChooseVertex:   rank 1 reporting
LeastPosition for rank 0 is: 4
after ChooseVertex:   rank 0 reporting
before ChooseVertex:  rank 0 reporting
before ChooseVertex:  rank 1 reporting
LeastPosition for rank 0 is: 7
after ChooseVertex:   rank 1 reporting

job aborted:
[ranks] message

[0] process exited without calling finalize

[1] terminated

---- error analysis -----

[0] on My-ComputerName
Assignmet3PP ended prematurely and may have crashed. exit code 3

---- error analysis -----

Solution

  • Your reduce command is:

    MPI_Reduce(&sendBuffr, &recvBuffr, 1, MPI_DOUBLE_INT, MPI_MINLOC, MASTER, MPI_COMM_WORLD);
    

    By using MPI_DOUBLE_INT, you are saying that you are sending a struct with two variables: a double followed by an int. That is not your struct, however: yours contains two ints. Therefore you should use MPI_2INT. These types were derived from this source. Alternatively, you could create your own datatype (e.g., with MPI_Type_vector or MPI_Type_create_struct).

    An example fix is:

    MPI_Reduce(&sendBuffr, &recvBuffr, 1, MPI_2INT, MPI_MINLOC, MASTER, MPI_COMM_WORLD);
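
    One detail worth double-checking when switching to MPI_2INT: MPI_MINLOC treats the first int of the pair as the value being minimized and the second as its index, so the value should come first in the struct. A minimal sketch of the assumed layout, reusing the sendBuffr/recvBuffr, least, leastPosition, and MASTER names from the question:

    //MPI_2INT pair as MPI_MINLOC expects it: (value, index)
    struct {
        int val;    //distance to minimize (compared first)
        int pos;    //index of that distance (carried along)
    } sendBuffr, recvBuffr;

    sendBuffr.val = least;
    sendBuffr.pos = leastPosition;

    MPI_Reduce(&sendBuffr, &recvBuffr, 1, MPI_2INT, MPI_MINLOC, MASTER, MPI_COMM_WORLD);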
    

    Also, a reduction followed by a broadcast can easily be combined into a single step with MPI_Allreduce().
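
    A minimal sketch of that combined step (same pair struct and names as above; an illustration rather than a tested patch):

    //One collective replaces the Reduce to MASTER plus the Bcast:
    //every rank receives the global minimum (distance, index) pair directly.
    MPI_Allreduce(&sendBuffr, &recvBuffr, 1, MPI_2INT, MPI_MINLOC, MPI_COMM_WORLD);
    leastPosition = recvBuffr.pos;

    The same collapse should apply to the distance update at the end of the Dijkstra loop: a single MPI_Allreduce(sendBuffer, dist, n, MPI_INT, MPI_MIN, MPI_COMM_WORLD) can stand in for the MPI_Reduce/MPI_Bcast pair there as well.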