Search code examples
c++scopecudadevicequalifiers

Type Qualifiers for a device class in CUDA


I'm currently attempting to make a piece of CUDA code with a class that will be used solely on the device side (i.e. host doesn't need to know of it's existence). However I cannot work out the correct qualifiers for the class (deviceclass below):

__device__ float devicefunction (float *x) {return x[0]+x[1];}

class deviceclass {
    private:
        float _a;

    public:
        deviceclass(float *x) {_a = devicefunction(x);}

        float getvalue () {return _a;}
};    

// Device code
__global__ void VecInit(float* A, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        deviceclass *test;

        test = new deviceclass(1.0, 2.0);

        A[i] = test->getvalue();
    }
}

// Standard CUDA guff below: Variables
float *h_A, *d_A;

// Host code
int main(int argc, char** argv)
{
    printf("Vector initialization...\n");
    int N = 10000;
    size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    cudaMalloc(&d_A, size);

    printf("Computing...\n");
    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);

    //...etc
}

Setting Deviceclass as solely a __device__ throws an error as it's called from a global function, however setting it as __device__ __host__ or __global__ seems unnecessary. Can someone point me in the right direction?


Solution

  • It turns out the qualifiers have to go on the member functions of the class, below is a fully working version:

    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    
    using namespace std;
    
    void Cleanup(void);
    
    
    // Functions to be pointed to
    __device__ float Plus (float a, float b) {return a+b;}
    
    class deviceclass {
    
        private:
            float test;
    
        public:
            __device__ deviceclass(float a, float b) {
                test = Plus(a,b);
            }
    
            __device__ float getvalue() {return test;}
    };
    
    // Device code
    __global__ void VecInit(float* A, int N)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < N) {
            deviceclass test(1.0, 2.0);
    
            A[i] = test.getvalue();
        }
    }
    
    // Standard CUDA guff below: Variables
    float *h_A, *d_A;
    
    // Host code
    int main(int argc, char** argv)
    {
        printf("Vector initialization...\n");
        int N = 10000;
        size_t size = N * sizeof(float);
    
        // Allocate
        h_A = (float*)malloc(size);
        cudaMalloc(&d_A, size);
    
        printf("Computing...\n");
        // Invoke kernel
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
        VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);
    
        // Copy result from device memory to host memory
        cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    
    
    
        // Verify result
        int i;
        for (i = 0; i < N; ++i) {
            cout << endl << h_A[i];
        }
    
        cout << endl;
    
        Cleanup();
    }
    
    void Cleanup(void)
    {
        // Free device memory
        if (d_A)
            cudaFree(d_A);
    
        // Free host memory
        if (h_A)
            free(h_A);
    
        cudaThreadExit();
    
        exit(0);
    }