For example, it looks like I don't need to call cudaHostUnregister
and cudaHostRegister again
when I switch devices.
The transfer performance and the GPU memory usage do not change much.
// Error-check helper: every CUDA runtime call returns cudaError_t, and a
// silently failed register/copy would invalidate the experiment's conclusion.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Select device 0 for the first transfer.
CUDA_CHECK(cudaSetDevice(0));

// Allocate a size x size float buffer on the host (256 MiB for size = 8192).
// Use size_t for the element count so the multiplication cannot overflow int.
const int size = 8192;
const size_t elementCount = static_cast<size_t>(size) * size;
const size_t totalBytes = sizeof(float) * elementCount;
float* cpuBuffer = new float[elementCount];

// Pin the host buffer. cudaHostRegisterPortable makes the pinned mapping
// usable from all CUDA contexts/devices, not only the current one.
CUDA_CHECK(cudaHostRegister(cpuBuffer, totalBytes, cudaHostRegisterPortable));

float* gpuBuffer = nullptr;
CUDA_CHECK(cudaMalloc(&gpuBuffer, totalBytes));

// Host -> device transfer on device 0 (benefits from the pinned source).
CUDA_CHECK(cudaMemcpy(gpuBuffer, cpuBuffer, totalBytes, cudaMemcpyHostToDevice));

// Release the device-0 allocation.
CUDA_CHECK(cudaFree(gpuBuffer));
// Deliberately NOT unregistering here: the experiment is whether the buffer
// must be re-pinned after switching devices.
//cudaHostUnregister(cpuBuffer);

// Switch to device 1 and repeat the transfer with the same pinned buffer.
CUDA_CHECK(cudaSetDevice(1));
CUDA_CHECK(cudaMalloc(&gpuBuffer, totalBytes));
// Deliberately NOT re-registering (see above).
//cudaHostRegister((void*)cpuBuffer, totalBytes, cudaHostRegisterPortable);

// Host -> device transfer on device 1, same pinned host buffer.
CUDA_CHECK(cudaMemcpy(gpuBuffer, cpuBuffer, totalBytes, cudaMemcpyHostToDevice));

// Cleanup: free device memory, unpin the host buffer, then free it.
CUDA_CHECK(cudaFree(gpuBuffer));
CUDA_CHECK(cudaHostUnregister(cpuBuffer));
delete[] cpuBuffer;
No. cudaHostRegister
just pins the memory of an already-allocated CPU buffer; it does not depend on the currently selected GPU. Pinned host memory — especially when registered with cudaHostRegisterPortable, as here — remains accessible to all GPUs in cudaMemcpy
calls, so there is no need to unregister and re-register it when switching devices.