I use thrust a lot in my code, because it is a great wrapper and provide very useful utilities, I am even more convinced since the support of asynchronous behaviour has been added.
My code was working well using cuda thrust until I recently tried to add multi-GPU support in my application. I experienced annoying
CUDA Runtime API error 77 : an illegal memory access was encountered
over part of my code that never showed any bounds problems before.
I added verbosity to my code and it appeared that my thrust::device_vector pointer address were changing along the execution, for no apparent reason, generating error 77 in handwritten kernels.
I may have misunderstood the UVA concept and its eventual "side effects", but still, I am interested in the understanding of the process that lead to pointer update.
I was not able to reproduce exactly my problem, in which I do not use temporary host variable to store cuda memory pointer, but only thrust::raw_pointer_cast on the fly when needed in kernel wrapper call.
But I have written a small program that shows what kind error I may have trouble with, note that this is not robust and you need to have at least 2 gpu on your system to run it:
/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/
//Standard Library
#include <iostream>
#include <vector>
//Cuda
#include "cuda_runtime.h"
//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( err != cudaSuccess )
{
printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
}
};
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
__global__ void write_memory( float* buf, float value )
{
printf("GPU TALK: Raw pointer is %p \n",buf);
buf[0] = value;
}
int main( int argc, char* argv[] )
{
//declare a vector of vector
std::vector<thrust::device_vector<float> > v;
float test;
float* tmp;
//Initialize first vector on GPU 0
cudaSetDevice( 0 );
v.emplace_back( 65536, 1.0f );
tmp = thrust::raw_pointer_cast( v.at(0).data() );
std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;
//Try to use it raw pointer
write_memory<<<1,1,0,0>>>( tmp, 2.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;
//Initialize second vector on GPU 1, but we do not use it
cudaSetDevice( 1 );
v.emplace_back( 65536, 1.0f );
std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl;
//Try to use the first vector : No segmentation fault ?
test = v.at(0)[0];
std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;
//Raw pointer stored elsewhere: generates a segmentation fault
write_memory<<<1,1,0,0>>>( tmp, 4.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;
return 0;
}
Here is and example of the output it produces on my machine:
Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb08000000 != 0xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb08000000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
./test.cu(68) : CUDA Runtime API error 77 : an illegal memory access was encountered terminate called after throwing an instance of 'thrust::system::system_error' what(): an illegal memory access was encountered
Thank you in advance for your help, I may also ask this question on thrust's github.
EDIT: Thanks to m.s and Hiura, here is a code that works as expected:
/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/
//Standard Library
#include <iostream>
#include <vector>
//Cuda
#include "cuda_runtime.h"
//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( err != cudaSuccess )
{
printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
}
};
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
__global__ void write_memory( float* buf, float value )
{
printf("GPU TALK: Raw pointer is %p \n",buf);
buf[0] = value;
}
int main( int argc, char* argv[] )
{
//declare a vector of vector
std::vector<thrust::device_vector<float> > v;
v.reserve(2);
float test;
float* tmp;
//Initialize first vector on GPU 0
cudaSetDevice( 0 );
v.emplace_back( 65536, 1.0f );
tmp = thrust::raw_pointer_cast( v.at(0).data() );
std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;
//Try to use it raw pointer
write_memory<<<1,1,0,0>>>( tmp, 2.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;
//Initialize second vector on GPU 1, but we do not use it
cudaSetDevice( 1 );
v.emplace_back( 65536, 1.0f );
std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl;
//Try to use the first vector : No segmentation fault ?
cudaSetDevice( 0 );
test = v.at(0)[0];
std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;
//Raw pointer stored elsewhere: generates a segmentation fault
write_memory<<<1,1,0,0>>>( tmp, 4.0f );
checkCudaErrors( cudaStreamSynchronize( NULL ) );
test = v.at(0)[0];
std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;
return 0;
}
It was one of the last place in my code where I did not used vector of pointer to objects instead of vector of objects for simplicity, but I see that I should have to avoid these annoying move/copy problems ...
Output now is:
Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb043c0000 != xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After third kernel launch, value is 4
So I installed CUDA quickly to test my hypothesis: adding a reserve
statement preserves the addresses.
//declare a vector of vector
std::vector<thrust::device_vector<float> > v;
v.reserve(2); // <<-- HERE
float test;
float* tmp;
And the outputs, first without the patch.
$ nvcc thrust.cu -std=c++11 -o test
$ ./test
Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
GPU TALK: Raw pointer is 0x700ca0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0x700d20000 != 0x700ca0000
Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0x700d20000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0x700ca0000
Host TALK: After third kernel launch, value is 3
with the patch:
$ nvcc thrust.cu -std=c++11 -o test
$ ./test
Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
GPU TALK: Raw pointer is 0x700ca0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0x700ca0000 != 0x700ca0000
Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0x700ca0000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0x700ca0000
Host TALK: After third kernel launch, value is 4