
How to better encapsulate GPU memory to achieve RAII-like characteristics


I use a GpuMatrix class to wrap GPU memory allocation/freeing and GPU memory copies, like this:

template <typename T, size_t N, size_t M>
class GpuMatrix {
 public:
  const size_t HeightDim = N;
  const size_t WidthDim = M;
  using DataType = T;

  const size_t ByteSize = HeightDim * WidthDim * sizeof(DataType);

  GpuMatrix() = default;

  ~GpuMatrix() = default;

  bool CopyFromCpuData(const DataType* cpu_data, size_t size) {
    assert(size * sizeof(DataType) == ByteSize);
    CUDA_CHECK(cudaMemcpy(_data, cpu_data, size * sizeof(DataType),
                          cudaMemcpyHostToDevice));
    return true;
  }

  bool CopyToCpuData(DataType* cpu_data, size_t size) {
    assert(size * sizeof(DataType) == ByteSize);
    CUDA_CHECK(cudaMemcpy(cpu_data, _data, size * sizeof(DataType),
                          cudaMemcpyDeviceToHost));
    return true;
  }
  __device__ size_t GetByteSize() { return ByteSize; }

  bool DestroyGpuMemory() {
    CUDA_CHECK(cudaFree(_data));
    return true;
  }

  bool MallocGpuMemory() {
    CUDA_CHECK(cudaMalloc((void**)&_data, ByteSize));
    return true;
  }
  // not safe: no bounds checking
  __device__ DataType& at(size_t y, size_t x) {
    return _data[y * WidthDim + x];
  }

 private:
  DataType* _data = nullptr;  // raw device pointer; never freed automatically
};

But it has some problems: the memory cannot be released automatically in the destructor, and the matrix must be passed to the kernel function by value, because the kernel cannot use variables that live in CPU memory. For example:


template <typename T, size_t N, size_t M, size_t Z>
__global__ void _multiplication_matrix(GpuMatrix<T, N, M> x,
                                       GpuMatrix<T, M, Z> y,
                                       GpuMatrix<T, N, Z> z) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  // ...
}

So what can I do?

I want a class that wraps GPU memory better.


Solution

  • For non-trivially-copyable or non-trivially-destructible kernel arguments, kernel launches do not follow standard C++ behavior.

    The programming guide lists the restriction. In your case, the implicit wrapper function for the kernel call copy-constructs a copy of the matrix, and deallocates it. No constructor or destructor is called from device code.
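
    To illustrate that hazard with a minimal sketch (Buf is a hypothetical type, not from the question): the host-side copy made for the launch is destroyed once the kernel has been enqueued, so its destructor frees device memory that the kernel and the original object still use.

    struct Buf {
        int* data = nullptr;
        Buf() { cudaMalloc(&data, sizeof(int)); }
        ~Buf() { cudaFree(data); } // also runs for the launch-side copy
    };

    __global__ void use(Buf b) { *b.data = 1; }

    int main() {
        Buf b;
        use<<<1, 1>>>(b);        // the by-value copy is destroyed after the launch
        cudaDeviceSynchronize(); // the kernel may now be writing to freed memory
    }                            // ~Buf() for b then frees the same pointer again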

    I would recommend using an existing library such as Thrust for memory management, and using a data view in the kernel that holds the pointers but does not do memory management itself. The following code gives an example.

    #include <cstdio>
    #include <thrust/device_vector.h>
    
    struct B{
        thrust::device_vector<int> buffer;
    
        struct View{
            int* buffer;
    
            __device__
            int myFunc1(){
                return buffer[0]; // example device member function
            }
        };
    
        B(int N) : buffer(N){}
    
        View view(){
            return View{buffer.data().get()};
        }
    
        operator View(){
            return view();
        }
    };
    
    __global__
    void kernel(B::View b){
        b.buffer[0] = 42;
        b.myFunc1();
    }
    
    int main(){
        B b(1000);
        kernel<<<1,1>>>(b.view());
        //kernel<<<1,1>>>(b); //this is also fine, performs implicit conversion to view
        cudaDeviceSynchronize();
    
        int i;
        cudaMemcpy(&i, b.buffer.data().get(), sizeof(int), cudaMemcpyDeviceToHost);
        printf("buffer[0] is %d\n", i); //prints 42
    }
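
    Applied to your GpuMatrix, the same owner/view split could look roughly like this (a sketch under the same idea, not a drop-in replacement: thrust::device_vector owns and frees the memory, and only the trivially-copyable MatrixView is passed to kernels by value):

    #include <cassert>
    #include <thrust/device_vector.h>

    // Non-owning view: holds only a raw pointer, safe to pass by value.
    template <typename T, size_t N, size_t M>
    struct MatrixView {
        T* data;
        __device__ T& at(size_t y, size_t x) { return data[y * M + x]; }
    };

    template <typename T, size_t N, size_t M>
    class GpuMatrix {
     public:
        GpuMatrix() : _data(N * M) {} // allocated here, freed by ~device_vector

        MatrixView<T, N, M> view() { return {_data.data().get()}; }

        bool CopyFromCpuData(const T* cpu_data, size_t size) {
            assert(size == N * M);
            thrust::copy(cpu_data, cpu_data + size, _data.begin());
            return true;
        }

        bool CopyToCpuData(T* cpu_data, size_t size) {
            assert(size == N * M);
            thrust::copy(_data.begin(), _data.end(), cpu_data);
            return true;
        }

     private:
        thrust::device_vector<T> _data; // RAII owner of the device memory
    };

    template <typename T, size_t N, size_t M, size_t Z>
    __global__ void _multiplication_matrix(MatrixView<T, N, M> x,
                                           MatrixView<T, M, Z> y,
                                           MatrixView<T, N, Z> z) {
        int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
        // ... same indexing as before, via x.at(...), y.at(...), z.at(...)
    }

    A launch then takes views, e.g. _multiplication_matrix<<<grid, block>>>(a.view(), b.view(), c.view()); and each allocation is released automatically when its GpuMatrix goes out of scope.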