
CUDA C++: Accessing struct from device global memory in kernel causes illegal memory access


I was working on a larger program using the NVIDIA CUDA Toolkit but kept receiving illegal memory access errors. I localized the problem to how I access a struct. However, as far as I understood, the struct and all of its members were allocated on the device, so there should have been no illegal memory access.

This example program works fine, outputs the correct value and exits with no error:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <iostream>


struct test_struct {
    int* arr;

    void allocate(int size);

    void deallocate();
};

void test_struct::allocate(int size) {
    cudaMalloc((void**)&arr, size);
}

void test_struct::deallocate() {
    cudaFree(arr);
}

__device__ struct test_struct* d_struct;

__global__ void print_arr(test_struct* test) {
    printf("%d\t", test->arr[0]);
}

int main()
{
    cudaMalloc((void**)&d_struct, sizeof(test_struct));

    test_struct* h_test;
    h_test = (test_struct*)malloc(sizeof(test_struct));
    h_test->allocate(64 * sizeof(int));

    cudaMemcpy(d_struct, h_test, sizeof(test_struct), cudaMemcpyHostToDevice);

    print_arr << <1, 1 >> > (d_struct);
    std::cout << cudaGetErrorString(cudaDeviceSynchronize());

    h_test->deallocate();
    free(h_test);
    cudaFree(d_struct);


    return 0;
}



However, changing the print_arr kernel to:

__global__ void print_arr() {
    printf("%d\t", d_struct->arr[0]);
}

and updating the launch in main accordingly to print_arr << <1, 1 >> > ()

causes an illegal memory access. The rest of the code is exactly the same; the only difference is that the first version passes the struct pointer to the kernel as an argument, while the second accesses it through the global symbol. So my question is: why does the first version run fine, but switching to the global symbol causes a crash? As far as I understand it, the pointer to the arr array is the same in both cases.


Solution

  • When creating a global variable like this:

    __device__ struct test_struct* d_struct;
    

    in CUDA we should not attempt to use cudaMalloc and cudaMemcpy directly with that symbol (d_struct). There are a few reasons for this, one of which is that both cudaMalloc and cudaMemcpy expect host storage for the pointers they access, and the __device__ qualifier indicates device storage.

    To provide an allocation for that device pointer, we need a 2-step process:

    struct test_struct *d_struct_htemp;  // a pointer stored in host memory
    cudaMalloc(&d_struct_htemp, sizeof(test_struct));  // step 1, create device allocation
    cudaMemcpyToSymbol(d_struct, &d_struct_htemp, sizeof(test_struct *)); // step 2
    

    Step 2 uses a different API (cudaMemcpyToSymbol) which is used to access such device variables via their device symbol. We don't use the address-of operator for the device symbol (by convention), even though we use it for the host variable. This operation as written copies the allocated pointer value from host to device, and puts it in the d_struct pointer for further use.
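
    As an aside, a roughly equivalent sketch (reusing d_struct_htemp from above) is to ask the runtime for the device address of the symbol via cudaGetSymbolAddress, and then use an ordinary cudaMemcpy on that address. The same convention applies: the symbol is passed directly, without the address-of operator.

    void *symbol_addr = nullptr;
    cudaGetSymbolAddress(&symbol_addr, d_struct);   // device address of the d_struct symbol itself
    cudaMemcpy(symbol_addr, &d_struct_htemp, sizeof(test_struct *), cudaMemcpyHostToDevice);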

    Finally, we do not (and should not) pass a global symbol such as d_struct explicitly as a kernel argument:

    print_arr << <1, 1 >> > (d_struct);
    

    instead, just use it directly in kernel code, as you would a global variable/symbol.

    Your code actually has pointers embedded in the struct, so an additional layer of allocations is needed, more-or-less what you have mapped out already. Here is an example based on the code you have shown:

    # cat t289.cu
    #include <stdio.h>
    
    #include <iostream>
    
    
    struct test_struct {
        int* arr;
    
        void allocate(int size);
    
        void deallocate();
    };
    
    void test_struct::allocate(int size) {
        cudaMalloc((void**)&arr, size);
        int val = 1234;
        cudaMemcpy(arr, &val, sizeof(int), cudaMemcpyHostToDevice);
    
    }
    
    void test_struct::deallocate() {
        cudaFree(arr);
    }
    
    __device__ struct test_struct* d_struct;
    
    __global__ void print_arr() {
        printf("%d\t\n", d_struct->arr[0]);
    }
    
    int main()
    {
        test_struct *d_struct_htemp;  // a pointer stored in host memory
        cudaMalloc((void**)&d_struct_htemp, sizeof(test_struct));  // step 1: create the device allocation
    
        test_struct* h_test;
        h_test = (test_struct*)malloc(sizeof(test_struct));
        h_test->allocate(64 * sizeof(int));
        cudaMemcpy(d_struct_htemp, h_test, sizeof(test_struct), cudaMemcpyHostToDevice);
        cudaMemcpyToSymbol(d_struct, &d_struct_htemp, sizeof(test_struct *));  // step 2: copy the pointer to the device symbol
    
        print_arr << <1, 1 >> > ();
        std::cout << cudaGetErrorString(cudaDeviceSynchronize()) << std::endl;
    
        h_test->deallocate();
        free(h_test);
        cudaFree(d_struct_htemp);
    
    
        return 0;
    }
    
    # nvcc -o t289 t289.cu
    # compute-sanitizer ./t289
    ========= COMPUTE-SANITIZER
    1234
    no error
    ========= ERROR SUMMARY: 0 errors
    #
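
    As a further aside, if the goal is simply a struct containing pointers that both host and device can use, managed (unified) memory can sidestep the device-symbol handling entirely. Here is a minimal sketch along those lines (assuming a platform that supports managed memory; this is an alternative illustration, not a drop-in replacement for the code above):

    #include <cstdio>

    struct test_struct {
        int *arr;
    };

    __global__ void print_arr(test_struct *test) {
        printf("%d\n", test->arr[0]);
    }

    int main() {
        test_struct *t = nullptr;
        cudaMallocManaged(&t, sizeof(test_struct));    // the struct itself is managed
        cudaMallocManaged(&t->arr, 64 * sizeof(int));  // so is the embedded array
        t->arr[0] = 1234;                              // host writes directly, no cudaMemcpy needed

        print_arr<<<1, 1>>>(t);                        // the device dereferences the same pointers
        cudaDeviceSynchronize();

        cudaFree(t->arr);
        cudaFree(t);
        return 0;
    }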