Search code examples
c++cudathrust

Having thrust::device_vector in global scope


I'm writing a program that calculates a lot of properties of triangle mesh data. Some of these properties, I'd like to calculate using thrust:: methods, other properties need to be calculated using raw memory pointers in CUDA kernels.

To transfer the data to the GPU I've got this in a transfer.cu file, (since creating and manipulating thrust::device_vectors in plain C++ code is not supported):

// thrust vectors (global)
thrust::host_vector<glm::vec3> trianglethrust_host;
thrust::device_vector<glm::vec3> trianglethrust_device;

extern "C" void trianglesToGPU_thrust(const trimesh::TriMesh *mesh, float** triangles) {
// fill host vector
for (size_t i = 0; i < mesh->faces.size(); i++) {
    // PUSHING DATA INTO HOST_VECTOR HERE (OMITTED FOR CLARITY)
} 
// copy to GPU by assigning host vector to device vector, like in the Thrust documentation
trianglethrust_device = trianglethrust_host;
// save raw pointer
*triangles = (float*)thrust::raw_pointer_cast(&(trianglethrust_device[0]));
}

This function trianglestoGPU_thrustis called from the main method of my C++ program. All works fine and dandy, until the program exits, and the (globally defined) trianglethrust_device vector goes out of scope. Thrust tries to free it, but the CUDA context is already gone, resulting in a cudaErrorInvalidDevicePointer

What would be considered best practice for my problem?

TL;DR: I want a thrust::device_vector that lives for the duration of my program, since I want to throw thrust:: functions (like transform etc) at it, as well as read and manipulate it through raw pointer access in CUDA.

Solution: In my case, I appearantly was free-ing using the raw data pointer somewhere further in the process. Removing that free, and ending my main loop with

trianglethrust_device.clear();
trianglethrust_device.shrink_to_fit();
trianglethrust_device.device_vector~;

To force the clearing of that vector before the CUDA runtime gets torn down. This worked, but is probably still a pretty ugly way of doing this.

I recommend Robert's answer on this one, and will mark it as valid.


Solution

  • As you've already discovered, the thrust vector container itself cannot be placed at file scope.

    One possible solution is to simply create the vectors you need at the beginning of main, then pass references to these to whatever functions need them.

    If you really want "global behavior" you could place pointers to vectors at global/file scope, then initialize the needed vectors at the beginning of main, and set the pointers at global scope to point to the vectors created in main.

    Based on the question in the comment I guess it's important/desirable that the main file be a .cpp file compiled with the host compiler. Therefore we can use the previously mentioned concepts combined with allocation of the vectors on the heap so as to avoid deallocation until the program terminates. Here's a full example:

    $ cat main.cpp
    #include "transfer.h"
    
    int main(){
    
      float **triangles, *mesh;
      triangles = new float *[1];
      mesh = new float[4];
      mesh[0] = 0.1f; mesh[1] = 0.2f; mesh[2] = 0.3f;
      trianglesToGPU_thrust(mesh, triangles);
      do_global_work(triangles);
      finish();
    }
    $ cat transfer.h
    void trianglesToGPU_thrust(const float *, float **);
    void do_global_work(float **);
    void finish();
    $ cat transfer.cu
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include "transfer.h"
    #include <iostream>
    #include <cstdio>
    #include <thrust/copy.h>
    
    __global__ void k(float *data, size_t ds){
      for (int i = 0; i < ds; i++) printf("%f,", data[i]);
    }
    
    // thrust vectors (global)
    thrust::host_vector<float> *trianglethrust_host;
    thrust::device_vector<float> *trianglethrust_device;
    
    void trianglesToGPU_thrust(const float *mesh, float** triangles) {
    //create vectors
      trianglethrust_host = new thrust::host_vector<float>;
      trianglethrust_device = new thrust::device_vector<float>;
    
    // fill host vector
      size_t i = 0;
      while (mesh[i] != 0.0f) {
        (*trianglethrust_host).push_back(mesh[i++]);
      }
    // copy to GPU by assigning host vector to device vector, like in the Thrust documentation
      *trianglethrust_device = *trianglethrust_host;
    // save raw pointer
      *triangles = (float*)thrust::raw_pointer_cast(&((*trianglethrust_device)[0]));
    }
    
    void do_global_work(float** triangles){
    
      std::cout << "from device vector:" << std::endl;
      thrust::copy((*trianglethrust_device).begin(), (*trianglethrust_device).end(), std::ostream_iterator<float>(std::cout, ","));
      std::cout << std::endl << "from kernel:" << std::endl;
      k<<<1,1>>>(*triangles, (*trianglethrust_device).size());
      cudaDeviceSynchronize();
      std::cout << std::endl;
    }
    
    void finish(){
      if (trianglethrust_host) delete trianglethrust_host;
      if (trianglethrust_device) delete trianglethrust_device;
    }
    $ nvcc -c transfer.cu
    $ g++ -c main.cpp
    $ g++ -o test main.o transfer.o -L/usr/local/cuda/lib64 -lcudart
    $ ./test
    from device vector:
    0.1,0.2,0.3,
    from kernel:
    0.100000,0.200000,0.300000,
    $
    

    Here's another approach, similar to the previous, using a std::vector of thrust containers, at global scope (only the transfer.cu file is different from the previous example, main.cpp and transfer.h are the same):

    $ cat transfer.cu
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include "transfer.h"
    #include <iostream>
    #include <cstdio>
    #include <thrust/copy.h>
    #include <vector>
    
    __global__ void k(float *data, size_t ds){
      for (int i = 0; i < ds; i++) printf("%f,", data[i]);
    }
    
    // thrust vectors (global)
    std::vector<thrust::host_vector<float> > trianglethrust_host;
    std::vector<thrust::device_vector<float> > trianglethrust_device;
    
    void trianglesToGPU_thrust(const float *mesh, float** triangles) {
    //create vectors
      trianglethrust_host.resize(1);
      trianglethrust_device.resize(1);
    
    // fill host vector
    size_t i = 0;
      while (mesh[i] != 0.0f) {
        trianglethrust_host[0].push_back(mesh[i++]);
      }
    // copy to GPU by assigning host vector to device vector, like in the Thrust documentation
      trianglethrust_device[0] = trianglethrust_host[0];
    // save raw pointer
      *triangles = (float*)thrust::raw_pointer_cast(trianglethrust_device[0].data());
    }
    
    void do_global_work(float** triangles){
    
      std::cout << "from device vector:" << std::endl;
      thrust::copy(trianglethrust_device[0].begin(), trianglethrust_device[0].end(), std::ostream_iterator<float>(std::cout, ","));
      std::cout << std::endl << "from kernel:" << std::endl;
      k<<<1,1>>>(*triangles, trianglethrust_device[0].size());
      cudaDeviceSynchronize();
      std::cout << std::endl;
    }
    
    void finish(){
      trianglethrust_host.clear();
      trianglethrust_device.clear();
    }
    $ nvcc -c transfer.cu
    $ g++ -o test main.o transfer.o -L/usr/local/cuda/lib64 -lcudart
    $ ./test
    from device vector:
    0.1,0.2,0.3,
    from kernel:
    0.100000,0.200000,0.300000,
    $