Search code examples
cuda, thrust

Using host class member pointing to device memory in device code


I want to have an instance of a Container class allocating some device and host memory on initialization. I want to use the allocated memory in device code, without passing the actual pointer (for API reasons).

How do I create a global __device__ pointer to the member pointing to the device memory? I am happy to use thrust if that helps.

Here is a small example:

#include <iostream>


// Owns a 4-int host buffer and a matching device buffer, with helpers to
// mirror data between them.
// NOTE(review): question code kept as posted — neither buffer is ever freed
// and no CUDA call is error-checked.
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));  // host-side copy
    int *d_int;                                // device-side copy (cudaMalloc'd in ctor)
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    // Push the current host contents to the device buffer (blocking copy).
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    // Pull the device contents back into the host buffer (blocking copy).
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};

// NOTE(review): a global object whose constructor calls the CUDA runtime runs
// before main(), before the runtime is safely usable — the answer below moves
// this into main() for that reason.
Container stuff;


// NOTE(review): this is the line the question is about — a __device__ variable
// cannot be statically initialized from a host-side expression, and even if it
// could, &stuff.d_int is an int** (address of the member), not the device
// buffer itself. Does not compile as written.
__device__ auto d_int = &stuff.d_int;  // How do I get that right?


// Writes one element per thread. No bounds guard, so the launch configuration
// must supply exactly as many threads as the buffer has elements (4 here).
__global__ void edit() {  // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}


// Launches 4 single-thread blocks, copies the result back, prints it.
// NOTE(review): no cudaGetLastError()/sync check after the launch; the
// blocking cudaMemcpy inside memcpyDeviceToHost doubles as the sync point.
int main(int argc, char const *argv[]) {
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}

Solution

  • There are two problems here:

    1. You can't statically initialize a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols. Use that instead.
    2. Your global scope declaration of stuff shouldn't work either for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.

    Putting these two things together should lead you to do something like this instead:

    __device__ int* d_int;
    
    // ...
    
    int main(int argc, char const *argv[]) {
    
        Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int*));
    
        edit<<<4, 1>>>();
    
        // ...
    

    Here is a fully worked example:

    $ cat t1199.cu
    #include <iostream>
    
    
    // Owns a 4-element int buffer on both host and device and keeps the two
    // in sync via explicit blocking copy helpers.
    struct Container {
        int *h_int = (int*)malloc(4*sizeof(int));  // host-side copy
        int *d_int;                                // device-side copy
        Container() {
            h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
            cudaMalloc(&d_int, 4*sizeof(int));
            memcpyHostToDevice();
        }
        // Release both buffers (the original leaked them). A shallow copy
        // would double-free, so copying is forbidden (rule of three).
        ~Container() {
            cudaFree(d_int);
            free(h_int);
        }
        Container(const Container&) = delete;
        Container& operator=(const Container&) = delete;
        // Push the current host contents to the device buffer.
        void memcpyHostToDevice() {
            cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
        }
        // Pull the device contents back into the host buffer.
        void memcpyDeviceToHost() {
            cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
        }
    };
    
    //Container stuff;


    // Global-scope device pointer. It is populated from the host in main()
    // via cudaMemcpyToSymbol before any kernel dereferences it.
    __device__ int  *d_int; // = &stuff.d_int;  // How do I get that right?


    // Writes one element per thread through the global d_int symbol. No
    // bounds guard: the launch must supply exactly as many threads as the
    // buffer has elements (4 in this example).
    __global__ void edit() {  // To keep the API simple I do not want to pass the pointer
        auto i = blockIdx.x*blockDim.x + threadIdx.x;
        d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
    }
    
    
    // Construct the container at main scope (not as a global), publish its
    // device pointer into the __device__ symbol, launch 4 single-thread
    // blocks, and print the result (the transcript below shows 1337).
    // NOTE(review): API calls and the launch are unchecked; the blocking
    // cudaMemcpy inside memcpyDeviceToHost doubles as the sync point.
    int main(int argc, char const *argv[]) {
        Container stuff;
        cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
        edit<<<4, 1>>>();
        stuff.memcpyDeviceToHost();
        std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
        return 0;
    }
    $ nvcc -std=c++11 -o t1199 t1199.cu
    $ cuda-memcheck ./t1199
    ========= CUDA-MEMCHECK
    1337
    ========= ERROR SUMMARY: 0 errors
    $