This program:
#include <string>
#include <stdexcept>
// Pair of 1-byte buffers handed to the stream callback via its void* argument.
struct buffers_t {
void* host_buffer;   // allocated with cudaMallocHost (pinned), see main()
void* device_buffer; // allocated with cudaMalloc, see main()
};
// Throws std::runtime_error, prefixed with `message`, if the CUDA runtime has a
// pending error. Note that cudaGetLastError both reads AND clears the sticky
// error state, so each failure is reported exactly once.
// `message` is taken by const reference to avoid copying on every check
// (backward-compatible: all existing call sites pass convertible strings).
void ensure_no_error(const std::string& message) {
    auto status = cudaGetLastError();
    if (status != cudaSuccess) {
        throw std::runtime_error(message + ": " + cudaGetErrorString(status));
    }
}
// Stream callback (cudaStreamCallback_t signature) that attempts to enqueue an
// async D2H copy of 1 byte onto the same stream.
//
// NOTE(review): this is the bug the question demonstrates — the CUDA docs
// (quoted below in this file) forbid any CUDA API call, direct or indirect,
// from inside a stream callback, so cudaMemcpyAsync here fails with
// cudaErrorNotPermitted.
// NOTE(review): ensure_no_error then throws; since this runs on a CUDA driver
// thread with no handler, the exception presumably escapes and std::terminate
// is called — consistent with the "terminate called" output shown below.
void my_callback(cudaStream_t stream, cudaError_t status, void* args) {
auto buffers = (buffers_t *) args;
// Copy direction is resolved by cudaMemcpyDefault (pointers are pinned-host
// and device allocations, see main()).
cudaMemcpyAsync(
buffers->host_buffer, buffers->device_buffer,
1, cudaMemcpyDefault, stream);
ensure_no_error("after cudaMemcpyAsync");
}
// Minimal reproducer: enqueue a stream callback that itself issues a CUDA API
// call (cudaMemcpyAsync), which the runtime rejects with cudaErrorNotPermitted.
int main() {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    ensure_no_error("after stream creation");

    buffers_t buffers;
    // Pinned host memory so the copy attempted in the callback could, in
    // principle, be truly asynchronous.
    cudaMallocHost(&buffers.host_buffer, 1);
    ensure_no_error("after cudaMallocHost");
    cudaMalloc(&buffers.device_buffer, 1);
    ensure_no_error("after cudaMalloc");

    cudaStreamAddCallback(stream, my_callback, &buffers, 0);
    ensure_no_error("after enqueue callback");
    cudaStreamSynchronize(stream);
    ensure_no_error("after sync");

    // Release resources before exit (not reached when the callback aborts the
    // process, but correct for the non-failing path).
    cudaFreeHost(buffers.host_buffer);
    cudaFree(buffers.device_buffer);
    cudaStreamDestroy(stream);
    return 0;
}
yields:
terminate called after throwing an instance of 'std::runtime_error'
what(): after cudaMemcpyAsync: operation not permitted
Aborted
That is odd, because the API reference for cudaMemcpyAsync
does not list cudaErrorNotPermitted
among its possible error returns. Is there really a problem with scheduling an async copy from a callback?
Note: My machine has a GTX 650 Ti (CC 3.0), CUDA 9.0, Linux kernel 4.8.0, driver 384.59.
Is there really a problem with scheduling an async copy from a callback?
From the documentation on stream callbacks:
A callback must not make CUDA API calls (directly or indirectly), as it might end up waiting on itself if it makes such a call leading to a deadlock.