For example, the following C++ code concurrently allocates two 4 GB slabs on two separate GPU devices using cuMemAlloc(). The numerical address ranges appear never to overlap with each other. Is this guaranteed for GPU devices in general on the same host? I wasn't able to find any mention of this in the documentation. (In contrast, I have read that this is not guaranteed for CPU and GPU memory addresses.)
The practical application of this is that one doesn't need to store the full (address, device_id) pair for a given GPU memory location. Instead, as long as we keep track of which memory address ranges have been allocated on which device, we can always infer the correct device when given only the (GPU) memory address.
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>
#include <cuda.h>
// Observation under test: memory allocated on different devices appears not to
// overlap in the numerical address space.
//
// One driver-API context per GPU, indexed by device ordinal. The entries are
// filled in by create_contexts() and made current by cuda_mem_alloc().
CUcontext g_device_contexts[2];
/// Returns a human-readable description of a driver-API result code.
///
/// @param result  Status code returned by a CUDA driver-API call.
/// @return The driver's description string, or a fallback message containing
///         the numeric code when the driver cannot describe it.
std::string errorMessage(CUresult const result) {
  // Start from nullptr: for an unrecognized code the driver reports failure
  // and leaves no valid string, and std::string(nullptr) is undefined
  // behaviour.
  char const *description = nullptr;
  if (cuGetErrorString(result, &description) != CUDA_SUCCESS ||
      description == nullptr) {
    return "unrecognized CUDA error " +
           std::to_string(static_cast<int>(result));
  }
  return description;
}
/// Converts a failed driver-API status into an exception.
///
/// @param result  Status code returned by a CUDA driver-API call.
/// @param line    Source line of the call site (callers pass __LINE__); it is
///                appended to the exception message.
/// @throws std::runtime_error if `result` is not CUDA_SUCCESS. The message is
///         the text from errorMessage() followed by " on line <line>".
void checkError(CUresult const result, int line) {
  if (result == CUDA_SUCCESS) {
    return;
  }
  throw std::runtime_error(errorMessage(result) + " on line " +
                           std::to_string(line));
}
void create_contexts() {
for (int i = 0; i < 2; ++i) {
checkError(cuCtxCreate(&g_device_contexts[i], 0, i), __LINE__);
}
}
void cuda_mem_alloc(int const device_id, CUdeviceptr *device_ptr_ptr) {
checkError(cuCtxSetCurrent(g_device_contexts[device_id]), __LINE__);
checkError(cuMemAlloc(device_ptr_ptr, 0x100000000ull), __LINE__);
}
int main() {
checkError(cuInit(0), __LINE__);
create_contexts();
CUdeviceptr device_ptr[2];
std::thread t1(cuda_mem_alloc, 1, &device_ptr[1]);
std::thread t0(cuda_mem_alloc, 0, &device_ptr[0]);
t0.join();
t1.join();
std::cout << "device_ptr[0]=" << (void *)device_ptr[0] << std::endl;
std::cout << "device_ptr[1]=" << (void *)device_ptr[1] << std::endl;
size_t const diff = device_ptr[0] < device_ptr[1]
? device_ptr[1] - device_ptr[0]
: device_ptr[0] - device_ptr[1];
std::cout << "Absolute diff=" << diff << std::endl;
return 0;
}
It should be guaranteed in a unified addressing (UA) setting (and for the driver API), subject to an appropriate interpretation of what that means for unified memory (UM) usage.
Non-overlap of CPU/GPU addresses should also be guaranteed in a UA setting. UA is in effect on any 64-bit platform.