Using CUDA Graphs, I want to transfer some data from one GPU to another via NVLink. After defining the graph and its nodes, I populate the memcpy
parameters as follows to transfer from GPU 0 to GPU 1:
// Describe a copy from d_inputs[0] (device 0) to d_results[1] (device 1) using
// the cross-device parameter struct, which names both device ordinals.
cudaMemcpy3DPeerParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcDevice = 0;
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr =
    make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.dstDevice = 1;
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr =
    make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.extent = make_cudaExtent(data_size, 1, 1);
// Add the first copy node with no dependency.
// NOTE: this is the call the compiler rejects: cudaGraphAddMemcpyNode expects a
// "const cudaMemcpy3DParms *", but memcpyParams is a cudaMemcpy3DPeerParms.
cudaGraphAddMemcpyNode(&copy_0to1, graph, NULL, 0, &memcpyParams);
The compiler doesn't accept the last argument of cudaGraphAddMemcpyNode. It says:
error: argument of type "cudaMemcpy3DPeerParms *" is incompatible with parameter of type "const cudaMemcpy3DParms *"
Statically casting one structure to the other doesn't work, and the two have different members too. From the driver header file:
/**
 * CUDA 3D memory copying parameters
 *
 * Holds a source and a destination (each with a cudaArray_t, a position
 * offset and a pitched pointer), the copy extent, and the transfer kind.
 * Unlike cudaMemcpy3DPeerParms, this struct has no srcDevice/dstDevice
 * members; it carries a `kind` member instead.
 */
struct __device_builtin__ cudaMemcpy3DParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
struct cudaExtent extent; /**< Requested memory copy size */
enum cudaMemcpyKind kind; /**< Type of transfer */
};
/**
 * CUDA 3D cross-device memory copying parameters
 *
 * Has the same source/destination/extent members as cudaMemcpy3DParms, plus
 * an explicit device ordinal for each side (srcDevice, dstDevice). It has no
 * `kind` member, so the two structs are not layout-compatible.
 */
struct __device_builtin__ cudaMemcpy3DPeerParms
{
cudaArray_t srcArray; /**< Source memory address */
struct cudaPos srcPos; /**< Source position offset */
struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */
int srcDevice; /**< Source device */
cudaArray_t dstArray; /**< Destination memory address */
struct cudaPos dstPos; /**< Destination position offset */
struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */
int dstDevice; /**< Destination device */
struct cudaExtent extent; /**< Requested memory copy size */
};
It seems to me that there is no function like cudaGraphAddMemcpyNode that accepts a cudaMemcpy3DPeerParms argument. So, how can I send a piece of data from one GPU directly to its peer over NVLink in CUDA graphs?
As Abator commented, using cudaMemcpyDefault with cudaMemcpy3DParms worked:
// Describe the copy from d_inputs[0] to d_results[1] with the ordinary
// (non-peer) 3D copy parameters: no device ordinals, only the two pointers.
cudaMemcpy3DParms memcpyParams = {0};
memset(&memcpyParams, 0, sizeof(memcpyParams));

// Source side: no array, zero offset, d_inputs[0] wrapped as a pitched pointer.
const cudaPitchedPtr srcPitched =
    make_cudaPitchedPtr((void *)d_inputs[0], data_size, data_count, 1);
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = srcPitched;

// Destination side: same layout, pointing at d_results[1].
const cudaPitchedPtr dstPitched =
    make_cudaPitchedPtr(d_results[1], data_size, data_count, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = dstPitched;

// Copy size: data_size wide, height and depth of 1.
const cudaExtent copyExtent = make_cudaExtent(data_size, 1, 1);
memcpyParams.extent = copyExtent;

// cudaMemcpyDefault: the transfer direction is inferred from the pointer
// values (requires unified virtual addressing).
memcpyParams.kind = cudaMemcpyDefault;
As mentioned in driver_types.h for cudaMemcpyDefault: "Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing."