Search code examples
c++cudathrust

thrust::raw_pointer_cast and multiple GPU, weird behaviour


I use thrust a lot in my code, because it is a great wrapper and provide very useful utilities, I am even more convinced since the support of asynchronous behaviour has been added.

My code was working well using cuda thrust until I recently tried to add multi-GPU support in my application. I experienced annoying

CUDA Runtime API error 77 : an illegal memory access was encountered

over part of my code that never showed any bounds problems before.

I added verbosity to my code and it appeared that my thrust::device_vector pointer address were changing along the execution, for no apparent reason, generating error 77 in handwritten kernels.

I may have misunderstood the UVA concept and its eventual "side effects", but still, I am interested in the understanding of the process that lead to pointer update.

I was not able to reproduce exactly my problem, in which I do not use temporary host variable to store cuda memory pointer, but only thrust::raw_pointer_cast on the fly when needed in kernel wrapper call.

But I have written a small program that shows what kind error I may have trouble with, note that this is not robust and you need to have at least 2 gpu on your system to run it:

/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/

//Standard Library
#include <iostream>
#include <vector>

//Cuda
#include "cuda_runtime.h"

//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( err != cudaSuccess )
    {
        printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
    }
};

#define checkCudaErrors(err)    __checkCudaErrors (err, __FILE__, __LINE__)

__global__ void write_memory( float* buf, float value )
{
    printf("GPU TALK: Raw pointer is %p \n",buf);
    buf[0] = value;
}

int main( int argc, char* argv[] )
{
    //declare a vector of vector
    std::vector<thrust::device_vector<float> > v;
    float test;
    float* tmp;

    //Initialize first vector on GPU 0
    cudaSetDevice( 0 );
    v.emplace_back( 65536, 1.0f );
    tmp = thrust::raw_pointer_cast( v.at(0).data() );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;

    //Try to use it raw pointer
    write_memory<<<1,1,0,0>>>( tmp, 2.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;

    //Initialize second vector on GPU 1, but we do not use it
    cudaSetDevice( 1 );
    v.emplace_back( 65536, 1.0f );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
    std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl; 

    //Try to use the first vector : No segmentation fault ?
    test = v.at(0)[0];
    std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
    write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;

    //Raw pointer stored elsewhere: generates a segmentation fault
    write_memory<<<1,1,0,0>>>( tmp, 4.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;

    return 0;
}

Here is and example of the output it produces on my machine:

Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb08000000 != 0xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb08000000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
./test.cu(68) : CUDA Runtime API error 77 : an illegal memory access was encountered terminate called after throwing an instance of 'thrust::system::system_error' what(): an illegal memory access was encountered

Thank you in advance for your help, I may also ask this question on thrust's github.

EDIT: Thanks to m.s and Hiura, here is a code that works as expected:

/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/

//Standard Library
#include <iostream>
#include <vector>

//Cuda
#include "cuda_runtime.h"

//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( err != cudaSuccess )
    {
        printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
    }
};

#define checkCudaErrors(err)    __checkCudaErrors (err, __FILE__, __LINE__)

__global__ void write_memory( float* buf, float value )
{
    printf("GPU TALK: Raw pointer is %p \n",buf);
    buf[0] = value;
}

int main( int argc, char* argv[] )
{
    //declare a vector of vector
    std::vector<thrust::device_vector<float> > v;
    v.reserve(2);
    float test;
    float* tmp;

    //Initialize first vector on GPU 0
    cudaSetDevice( 0 );
    v.emplace_back( 65536, 1.0f );
    tmp = thrust::raw_pointer_cast( v.at(0).data() );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;

    //Try to use it raw pointer
    write_memory<<<1,1,0,0>>>( tmp, 2.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;

    //Initialize second vector on GPU 1, but we do not use it
    cudaSetDevice( 1 );
    v.emplace_back( 65536, 1.0f );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
    std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl; 

    //Try to use the first vector : No segmentation fault ?
    cudaSetDevice( 0 );
    test = v.at(0)[0];
    std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
    write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;

    //Raw pointer stored elsewhere: generates a segmentation fault
    write_memory<<<1,1,0,0>>>( tmp, 4.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;

    return 0;
}

It was one of the last place in my code where I did not used vector of pointer to objects instead of vector of objects for simplicity, but I see that I should have to avoid these annoying move/copy problems ...

Output now is:

Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb043c0000 != xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After third kernel launch, value is 4


Solution

  • So I installed CUDA quickly to test my hypothesis: adding a reserve statement preserves the addresses.

    //declare a vector of vector
    std::vector<thrust::device_vector<float> > v;
    v.reserve(2); // <<-- HERE
    float test;
    float* tmp;
    

    And the outputs, first without the patch.

     $ nvcc thrust.cu  -std=c++11 -o test
     $ ./test 
      Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
     GPU TALK: Raw pointer is 0x700ca0000 
      Host TALK: After first kernel launch, value is 2
      Host TALK: Raw pointer of vector 0 at step 1 is now 0x700d20000 != 0x700ca0000
      Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
      Host TALK: Before second kernel launch, value is 2
     GPU TALK: Raw pointer is 0x700d20000 
      Host TALK: After second kernel launch, value is 3
     GPU TALK: Raw pointer is 0x700ca0000 
      Host TALK: After third kernel launch, value is 3
    

    with the patch:

     $ nvcc thrust.cu  -std=c++11 -o test
     $ ./test 
      Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
     GPU TALK: Raw pointer is 0x700ca0000 
      Host TALK: After first kernel launch, value is 2
      Host TALK: Raw pointer of vector 0 at step 1 is now 0x700ca0000 != 0x700ca0000
      Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
      Host TALK: Before second kernel launch, value is 2
     GPU TALK: Raw pointer is 0x700ca0000 
      Host TALK: After second kernel launch, value is 3
     GPU TALK: Raw pointer is 0x700ca0000 
      Host TALK: After third kernel launch, value is 4