Search code examples
c++cudamemcpy

PeerToPeer data transfer with CUDA graphs


Using CUDA Graphs, I want to transfer some data from one GPU to another via NVLink. After defining the graph and nodes, I populate the memcpy parameters as follows to transfer from GPU 0 to GPU 1:

// Parameters describing a cross-device copy from device 0 to device 1.
cudaMemcpy3DPeerParms memcpyParams = {0};

// Zero every byte of the structure before the individual fields are set.
memset(&memcpyParams, 0, sizeof(memcpyParams));
// Source side: device 0, pointer-based (no array), no position offset.
memcpyParams.srcDevice = 0;
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
  make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
// Destination side: device 1, pointer-based (no array), no position offset.
memcpyParams.dstDevice = 1;
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
  make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
// Requested copy size: data_size wide, with height and depth of 1.
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);

// Add the first copy node with no dependency
// NOTE: this is the call the compiler rejects. memcpyParams is a
// cudaMemcpy3DPeerParms, but the last parameter must be a
// const cudaMemcpy3DParms *.
cudaGraphAddMemcpyNode(&copy_0to1, graph, NULL, 0, &memcpyParams);

The compiler doesn't like the last parameter of the cudaGraphAddMemcpyNode. It says:

error: argument of type "cudaMemcpy3DPeerParms *" is incompatible with parameter of type "const cudaMemcpy3DParms *"

Casting between the two structures with static_cast doesn't work, and they have different members too. From the driver header file:

/**
 * CUDA 3D memory copying parameters
 *
 * NOTE(review): this is the parameter type that cudaGraphAddMemcpyNode
 * expects according to the compiler error. It carries a `kind` field and has
 * no srcDevice/dstDevice fields.
 */
struct __device_builtin__ cudaMemcpy3DParms
{
    cudaArray_t            srcArray;  /**< Source memory address */
    struct cudaPos         srcPos;    /**< Source position offset */
    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
  
    cudaArray_t            dstArray;  /**< Destination memory address */
    struct cudaPos         dstPos;    /**< Destination position offset */
    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
  
    struct cudaExtent      extent;    /**< Requested memory copy size */
    enum cudaMemcpyKind    kind;      /**< Type of transfer */
};

/**
 * CUDA 3D cross-device memory copying parameters
 *
 * NOTE(review): compared with cudaMemcpy3DParms, this structure adds
 * srcDevice and dstDevice and has no `kind` field, so the two types are not
 * interchangeable.
 */
struct __device_builtin__ cudaMemcpy3DPeerParms
{
    cudaArray_t            srcArray;  /**< Source memory address */
    struct cudaPos         srcPos;    /**< Source position offset */
    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
    int                    srcDevice; /**< Source device */
  
    cudaArray_t            dstArray;  /**< Destination memory address */
    struct cudaPos         dstPos;    /**< Destination position offset */
    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
    int                    dstDevice; /**< Destination device */
  
    struct cudaExtent      extent;    /**< Requested memory copy size */
};

It seems to me that there are no functions like cudaGraphAddMemcpyNode that accept cudaMemcpy3DPeerParms arguments. So, how can I send a piece of data from one GPU to its peer directly using NVLink in CUDA graphs?


Solution

  • As Abator commented, using cudaMemcpyDefault with cudaMemcpy3DParms worked:

    // Describe the GPU 0 -> GPU 1 copy with the plain cudaMemcpy3DParms type
    // (the type the graph memcpy node accepts). There are no device fields to
    // fill in: with kind = cudaMemcpyDefault the direction of the transfer is
    // inferred from the pointer values, which requires unified virtual
    // addressing.
    // NOTE(review): a direct GPU-to-GPU path presumably also needs peer access
    // enabled between the two devices — confirm in the surrounding setup code.

    // `= {0}` already zero-initializes every member, so no separate memset is
    // needed before the fields are assigned.
    cudaMemcpy3DParms memcpyParams = {0};

    // Source: pointer-based (no array), no position offset.
    memcpyParams.srcArray = NULL;
    memcpyParams.srcPos = make_cudaPos(0, 0, 0);
    memcpyParams.srcPtr =
      make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);

    // Destination: pointer-based (no array), no position offset.
    memcpyParams.dstArray = NULL;
    memcpyParams.dstPos = make_cudaPos(0, 0, 0);
    memcpyParams.dstPtr =
      make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);

    // Requested copy size: data_size wide, with height and depth of 1.
    memcpyParams.extent = make_cudaExtent(data_size, 1, 1);

    // Let the runtime infer the transfer direction from the two pointers.
    memcpyParams.kind = cudaMemcpyDefault;
    

    As mentioned in driver_types.h for cudaMemcpyDefault: "Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing."