Tags: cuda, mpi, ipc, multiprocess

Multiple processes sharing / reading from one large block of CUDA device memory


I have a multi-process application that shares a single GPU using the CUDA Multi-Process Service (MPS). Each process creates several device arrays; however, one of them is large (~5 GB) and constant, so I thought I could allocate that memory once in one process and have the other processes read from the same memory block via inter-process communication (similar to the CUDA API example shown here).

Following the linked CUDA example, I tried to implement a simple test program, but I keep hitting an API error: when I call cudaIpcOpenMemHandle, I get an invalid argument error. I post the code below in the hope that someone can easily spot the reason for the error, or perhaps suggest a better use of the CUDA API to accomplish what I'm trying to do.

#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <sys/mman.h>

#define blockSize 128
#define N 1000
#define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); }


__global__ void kernel(double* out, double* in, double val){
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int thread_stride = blockDim.x * gridDim.x;
    for (int i=tid; i < N; i+=thread_stride){
        out[i] = in[i]*val;
    }
}

static void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static void error_msg(cudaError_t err, int rank){
    if (err != cudaSuccess){
        printf("RANK %d recvd CUDA error message: %s\n", rank, cudaGetErrorString(err));
        exit(err);
    }
}

void check_access(){
    cudaDeviceProp prop;
    gpuErr(cudaGetDeviceProperties(&prop, 0));
    if (prop.unifiedAddressing)
        printf("> GPU%d = is capable of UVA\n", 0);

    // NOTE: only interested in enabling intra-device peer-to-peer, so I think this test doesn't matter?
    //int can_access=-1;
    //int num_dev=2;
    //// note, here I was confused, I want the ability to have a process on device 0 access
    //for (peer_dev=0; peer_dev <num_dev, peer_dev++){
    //    int peer_dev=0; // note if peer_dev is 1
    //    gpuErr(cudaDeviceCanAccessPeer(&can_access, 0,peer_dev));
    //    if (can_access)
    //        printf("device 0 has peerdev=%d access\n", peer_dev);
    //    else
    //        printf("device 0 has no peerdev=%d access\n", peer_dev);
    //}
}

int main(){
    MPI_Init(NULL,NULL);
    int size,rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank==0)
        check_access();
    gpuErr(cudaSetDevice(0));

    double* out;
    double * in;
    gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));

    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
        mmap(NULL, sizeof(cudaIpcMemHandle_t),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);

    assert(MAP_FAILED != memHand);
    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank==0){
        gpuErr(cudaMalloc((void **)&in, N*sizeof(double)));
        gpuErr(cudaIpcGetMemHandle((cudaIpcMemHandle_t *) &memHand[0], (void *)in));
        
        double * temp = new double[N];
        for (int i=0; i < N; i++)
            temp[i] = 1;
        gpuErr(cudaMemcpy(in, temp, N*sizeof(double), cudaMemcpyHostToDevice));
        delete[] temp;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // the following call throws a CUDA error: invalid argument
    if (rank >0 )
        gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
    
    MPI_Barrier(MPI_COMM_WORLD);

    int numBlocks = (N + blockSize - 1) / blockSize;
    double rank_val=(double) rank;
    kernel<<<numBlocks, blockSize>>>(out, in, rank_val);
    error_msg(cudaGetLastError(), rank);
    gpuErr(cudaDeviceSynchronize());
    MPI_Barrier(MPI_COMM_WORLD);

    // test the kernel results
    double sum = 0;
    for (int i=0; i < N; i++)
        sum += out[i];
    printf("mpirank=%d, comm.size=%d, result=%f\n", rank, size, sum);
    assert(sum==N*rank);

    // cleanup
    if (rank>0)
        cudaIpcCloseMemHandle(in);
    cudaFree(out);
    if (rank==0)
        cudaFree(in);

    return 0;
}

I compile with

 nvcc -I/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/include  --compiler-options=-march=skylake-avx512 -L/usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib -lmpi ipc_tester.cu
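
For what it's worth, I believe an equivalent and somewhat simpler invocation is to let nvcc use the matching Open MPI compiler wrapper as its host compiler (assuming mpicxx from the same Open MPI install is on PATH; I have not tested this variant here):

 nvcc -ccbin=mpicxx --compiler-options=-march=skylake-avx512 ipc_tester.cu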

Single-process job output (looks correct):

$ srun -n1 -c2 ./a.out
> GPU0 = is capable of UVA
mpirank=0, comm.size=1, result=0.000000

Multi-process job output (hits the error in the call to cudaIpcOpenMemHandle):

$ srun -n2 -c2 ./a.out
GPUassert: invalid argument ipc_tester.cu 92

Compute sanitizer output:

$ srun -n2 -c2 compute-sanitizer ./a.out
========= COMPUTE-SANITIZER
========= COMPUTE-SANITIZER
========= Program hit invalid device context (error 201) on CUDA API call to cuCtxGetDevice.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:cuCtxGetDevice [0x155550d083eb]
=========                in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
=========     Host Frame:uct_cuda_base_query_devices [0x15553e03f170]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/ucx/libuct_cuda.so.0
=========     Host Frame:uct_md_query_tl_resources [0x15553e6c44c6]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libuct.so.0
=========     Host Frame: [0x15553e9095a9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90a7f9]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame: [0x15553e90abfd]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:ucp_init_version [0x15553e90b7f3]
=========                in /usr/common/software/sles15_cgpu/ucx/1.10.1/lib/libucp.so.0
=========     Host Frame:mca_pml_ucx_open [0x15553edc7e70]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/openmpi/mca_pml_ucx.so
=========     Host Frame:mca_base_framework_components_open [0x15555299ef2d]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame: [0x155554472ec7]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:mca_base_framework_open [0x1555529a8b31]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libopen-pal.so.40
=========     Host Frame:ompi_mpi_init [0x15555447fb5b]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame:MPI_Init [0x15555442dc01]
=========                in /usr/common/software/sles15_cgpu/openmpi/4.0.3/gcc/lib/libmpi.so.40
=========     Host Frame: [0x403f04]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame:__libc_start_main [0x1555531173ea]
=========                in /lib64/libc.so.6
=========     Host Frame: [0x403d1a]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= 
[... the same "invalid device context (error 201) on cuCtxGetDevice" error and backtrace is reported several more times, interleaved from both ranks, as UCX probes the CUDA devices during MPI_Init; those duplicate backtraces are omitted ...]
========= Program hit invalid argument (error 1) on CUDA API call to cudaIpcOpenMemHandle.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x155550dde1b3]
=========                in /usr/common/software/sles15_cgpu/cuda/11.1.1/lib64/compat/libcuda.so.1
=========     Host Frame: [0x433fac]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame: [0x40412e]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
=========     Host Frame:__libc_start_main [0x1555531173ea]
=========                in /lib64/libc.so.6
=========     Host Frame: [0x403d1a]
=========                in /global/cfs/cdirs/lcls/dermen/dulios/./a.out
========= 
GPUassert: invalid argument ipc_tester.cu 92
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 4 errors


System info:

$ lsb_release  -a
LSB Version:    n/a
Distributor ID: SUSE
Description:    SUSE Linux Enterprise Server 15 SP2
Release:        15.2
Codename:       n/a

$ nvidia-smi 
Tue Sep 27 10:05:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |
| N/A   34C    P0    38W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+


Solution

  • As pointed out in the comments, the memory handle (memHand) was never actually set on ranks > 0: the anonymous mmap is private to each independently launched MPI process, so ranks > 0 passed an all-zero handle to cudaIpcOpenMemHandle.

    After broadcasting the memory handle with MPI instead, I arrived at a solution. The patch below leads to working code.

    @@ -66,12 +66,7 @@ int main(){
         double * in;
         gpuErr(cudaMallocManaged((void **)&out, N*sizeof(double)));
     
    -    cudaIpcMemHandle_t * memHand = (cudaIpcMemHandle_t *)
    -        mmap(NULL, sizeof(cudaIpcMemHandle_t),
    -            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0);
    -
    -    assert(MAP_FAILED != memHand);
    -    memset((void *) memHand, 0, sizeof(cudaIpcMemHandle_t));
    +    cudaIpcMemHandle_t memHand[1];
     
         MPI_Barrier(MPI_COMM_WORLD);
     
    @@ -87,6 +82,21 @@ int main(){
         }
         MPI_Barrier(MPI_COMM_WORLD);
     
    +// Broadcast the IPC memory handle
    +// get the size of the memory handle so every rank can size the broadcast buffer
    +    int hand_size[1];
    +    if (rank==0)
    +        hand_size[0]= sizeof(memHand[0]);
    +    MPI_Bcast(&hand_size[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
    +
    +    // create the char container for memHandler broadcast
    +    char memHand_C[hand_size[0]];
    +    if (rank==0)
    +        memcpy(&memHand_C, &memHand[0], hand_size[0]);
    +    MPI_Bcast(&memHand_C, hand_size[0], MPI_BYTE, 0, MPI_COMM_WORLD);
    +    if (rank >0)
    +        memcpy(&memHand[0], &memHand_C, hand_size[0]);
    +
         // the following call throws a CUDA error: invalid argument
         if (rank >0 )
             gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));
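
    For reference, since cudaIpcMemHandle_t is a fixed-size struct whose size every rank already knows, the handle can also be broadcast directly as raw bytes, without first exchanging its size. A minimal sketch of that alternative (not the exact code I ran), replacing the broadcast block added above:

        // Rank 0 has already filled memHand[0] via cudaIpcGetMemHandle().
        // Broadcast the handle bytes themselves; sizeof(cudaIpcMemHandle_t) is the same on every rank.
        MPI_Bcast(&memHand[0], sizeof(cudaIpcMemHandle_t), MPI_BYTE, 0, MPI_COMM_WORLD);

        // Non-root ranks then map rank 0's allocation into their own address space.
        if (rank > 0)
            gpuErr(cudaIpcOpenMemHandle((void **) &in, memHand[0], cudaIpcMemLazyEnablePeerAccess));

    Either way, the essential point is that the handle bytes must be explicitly communicated to the other ranks; the anonymous mmap in the original code never did that for processes launched separately by srun.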