Search code examples
c++cudanvcc

CUDA pointer inside kernel becomes null


I'm trying to pass a pointer to triangle data to a kernel, but when debugging I find the pointer becomes null, d_list contains the triangles and both d_list and d_world are members of the main window class, also the error checking returns "no error"

d_list is of type hittable* and d_world is hittable_list*

__global__ void create_world(hittable* d_list, hittable_list* d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        d_world = new hittable_list(&d_list, num_triangles);
    }
}

checkCudaErrors(cudaMalloc((void**)&d_list, num_hittables * sizeof(triangle)));
checkCudaErrors(cudaMalloc((void**)&d_world, sizeof(hittable_list)));

cudaMemcpy(d_list, m_triangles.data(), num_hittables * sizeof(triangle), cudaMemcpyHostToDevice);

create_world << <1, 1 >> > (d_list, d_world, num_hittables);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());

I tried initializing the "world" in the host then cudaMemcpy'ing to the d_world, but it also fails

EDIT: minimal exmple

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>

struct make_list {
    __device__ make_list(float** list, int n) { contents = list; size = n; };
    float** contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
}

__global__ void create_world(float* d_list, make_list* d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        d_world = new make_list(&d_list, num_triangles);
    }
}

int main () {
    float* d_list;
    make_list* d_world;

    int size = 8;
    std::vector<float> m_triangles(size);

    cudaMalloc((void**)&d_list, size * sizeof(float));
    cudaMalloc((void**)&d_world, sizeof(make_list));

    cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (&d_world);
    cudaDeviceSynchronize();

    return 0;
}

EDIT 2: updated with virtual function call, it's causing crashes

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>

class hittable {
public:
    __device__ virtual int hit() const = 0;
};

struct make_list : public hittable {
    __device__ make_list(float** list, int n) { contents = list; size = n; };
    __device__ virtual int hit() const {
        return size;
    }
    float** contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
    printf("size = %d\n", size);
    int new_size = (*world)->hit();
    printf("new size = %d\n", new_size);
}

__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        *d_world = new make_list(&d_list, num_triangles);
    }
}

int main() {
    float* d_list;
    make_list** d_world;
    cudaMalloc(&d_world, sizeof(make_list*));
    int size = 8;
    std::vector<float> m_triangles(size);

    cudaMalloc((void**)&d_list, size * sizeof(float));

    cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (d_world);
    cudaDeviceSynchronize();

    return 0;
}

Solution

  • There are at least a few issues.

    1. In C++, when you pass a variable to a function via the function parameters, a copy of that variable is made for local use by the function. Any modifications made to that variable will not show up globally, i.e. in the calling environment, because the function is operating on a copy of the variable. Therefore this could never do what you want:

       d_world = new make_list(&d_list, num_triangles);
      

      There is nothing illegal about it, per se, but it will not have the desired effect. The global copy of d_world is unchanged by that assignment. This is a C++ concept, not unique or specific to CUDA, and it trips people up from time to time.

    2. This is almost never legal in CUDA:

      render << <1, 1 >> > (&d_world);
                            ^       
      

      In typical usage, it is not possible to pass the address of a host location to device code via a kernel call parameter. Any attempt to dereference that pointer &d_world will result in dereferencing the address of a host location. That is illegal in CUDA device code.

    3. While not necessarily a problem at this point, you should be aware of the fact that in-kernel new operates against the device heap which has a default limit of 8MB, and furthermore allocations created this way cannot take part in host-issued cudaMemcpy* calls. These topics are covered in the programming guide.

    When I make changes to address those first 2 items, I get what appear to be sensible results:

    $ cat t2190.cu
    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <vector>
    #include <cstdio>
    
    struct make_list {
        __device__ make_list(float** list, int n) { contents = list; size = n; };
        float** contents;
        int size;
    };
    
    __global__ void render(make_list** world) {
        int size = (*world)->size; // set a breakpoint here, the size is 0
        printf("size = %d\n", size);
    }
    
    __global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
        if (threadIdx.x == 0 && blockIdx.x == 0) {
            // the class hittable_list contains a counter for the list size, which no matter the
            // scene size it always becomes zero
            *d_world = new make_list(&d_list, num_triangles);
        }
    }
    
    int main () {
        float* d_list;
        make_list** d_world;
        cudaMalloc(&d_world, sizeof(make_list*));
        int size = 8;
        std::vector<float> m_triangles(size);
    
        cudaMalloc((void**)&d_list, size * sizeof(float));
    
        cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
    
        create_world << <1, 1 >> > (d_list, d_world, size);
        cudaDeviceSynchronize();
    
        render << <1, 1 >> > (d_world);
        cudaDeviceSynchronize();
    
        return 0;
    }
    $ nvcc -o t2190 t2190.cu
    $ compute-sanitizer ./t2190
    ========= COMPUTE-SANITIZER
    size = 8
    ========= ERROR SUMMARY: 0 errors
    $
    

    Although you don't show how you are using the contents member of the make_list object, I'm doubtful that this could possibly do anything useful for you, for the same reason as I have indicated in item 1 above:

    *d_world = new make_list(&d_list,
                             ^^^^^^^
    

    The address you are using there is the address of a temporary local variable made by the function. My guess is you probably want d_list there or possibly *d_list, and this might necessitate changes in your contents object member of the handling of that object member. Whatever you are doing there will almost certainly require changes not unlike the refactoring I have done to address items 1 and 2.

    For now, without knowing anything further about your intent, something that seems sensible to me would be like this:

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <vector>
    #include <cstdio>
    
    struct make_list {
        __device__ make_list(float* list, int n) { contents = list; size = n; };
        float* contents;
        int size;
    };
    
    __global__ void render(make_list** world) {
        int size = (*world)->size; // set a breakpoint here, the size is 0
        printf("size = %d\n", size);
    }
    
    __global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
        if (threadIdx.x == 0 && blockIdx.x == 0) {
            // the class hittable_list contains a counter for the list size, which no matter the
            // scene size it always becomes zero
            *d_world = new make_list(d_list, num_triangles);
        }
    }
    
    int main () {
        float* d_list;
        make_list** d_world;
        cudaMalloc(&d_world, sizeof(make_list*));
        int size = 8;
        std::vector<float> m_triangles(size);
    
        cudaMalloc((void**)&d_list, size * sizeof(float));
    
        cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
    
        create_world << <1, 1 >> > (d_list, d_world, size);
        cudaDeviceSynchronize();
    
        render << <1, 1 >> > (d_world);
        cudaDeviceSynchronize();
    
        return 0;
    }