I use GpuMatrix to wrap GPU memory allocation/deallocation and host/device copies, like this:
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// CUDA_CHECK is assumed to be an error-checking macro defined elsewhere.

template <typename T, size_t N, size_t M>
class GpuMatrix {
public:
    const size_t HeightDim = N;
    const size_t WidthDim = M;
    using DataType = T;
    const size_t ByteSize = HeightDim * WidthDim * sizeof(DataType);

    GpuMatrix() = default;
    ~GpuMatrix() = default;

    bool CopyFromCpuData(const DataType* cpu_data, size_t size) {
        assert(size * sizeof(DataType) == ByteSize);
        CUDA_CHECK(cudaMemcpy(_data, cpu_data, size * sizeof(DataType),
                              cudaMemcpyHostToDevice));
        return true;
    }

    bool CopyToCpuData(DataType* cpu_data, size_t size) {
        assert(size * sizeof(DataType) == ByteSize);
        CUDA_CHECK(cudaMemcpy(cpu_data, _data, size * sizeof(DataType),
                              cudaMemcpyDeviceToHost));
        return true;
    }

    __device__ size_t GetByteSize() { return ByteSize; }

    bool DestroyGpuMemory() {
        CUDA_CHECK(cudaFree(_data));
        return true;
    }

    bool MallocGpuMemory() {
        CUDA_CHECK(cudaMalloc((void**)&_data, ByteSize));
        return true;
    }

    // not safe: no bounds checking
    __device__ DataType& at(size_t y, size_t x) {
        return _data[y * WidthDim + x];
    }

private:
    DataType* _data = nullptr;
};
but it has some problems: the memory cannot be released automatically in the destructor, and the object must be passed to kernels by value, because a kernel cannot access host memory, so it cannot take the object by pointer or reference, for example:
template <typename T, size_t N, size_t M, size_t Z>
__global__ void _multiplication_matrix(GpuMatrix<T, N, M> x,
                                       GpuMatrix<T, M, Z> y,
                                       GpuMatrix<T, N, Z> z) {
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
    // ...
}
So what can I do? I want a class that wraps GPU memory more cleanly.
For non-trivially-copyable or non-trivially-destructible kernel arguments, kernel launches do not follow standard C++ behavior.
The CUDA programming guide lists this restriction. In your case, the implicit wrapper function generated for the kernel call copy-constructs a copy of each matrix on the host and then destroys it. No constructor or destructor is ever called from device code.
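To see why this matters: if GpuMatrix freed its memory in the destructor, the by-value copy created for the launch would release the device pointer on the host side while the kernel may still be using it, and the original object's destructor would then free it a second time. Here is a minimal sketch of that failure mode (Owner is an illustrative name; depending on the CUDA version, such a parameter may be rejected at compile time or silently misbehave):

#include <cuda_runtime.h>

struct Owner {
    int* p = nullptr;
    explicit Owner(size_t n) { cudaMalloc((void**)&p, n * sizeof(int)); }
    ~Owner() { cudaFree(p); }  // also runs for the host-side copy made at launch
};

__global__ void kernel(Owner o) {
    o.p[0] = 1;  // o.p may already have been freed by the copy's destructor
}

// Owner a(16);
// kernel<<<1, 1>>>(a);  // the launch copy is destroyed -> cudaFree(a.p),
//                       // and a's destructor later frees p again (double free)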
I would recommend using an existing library for memory management, such as Thrust, and using a data view in the kernel which holds the pointers but does not do memory management. The following code gives an example:
#include <cstdio>
#include <thrust/device_vector.h>

struct B {
    thrust::device_vector<int> buffer;  // owns the device memory (RAII)

    // Non-owning view; trivially copyable, so it is safe as a kernel argument.
    struct View {
        int* buffer;

        __device__
        int myFunc1() {
            return buffer[0];
        }
    };

    B(int N) : buffer(N) {}

    View view() {
        return View{buffer.data().get()};
    }

    operator View() {
        return view();
    }
};

__global__
void kernel(B::View b) {
    b.buffer[0] = 42;
    b.myFunc1();
}

int main() {
    B b(1000);
    kernel<<<1, 1>>>(b.view());
    //kernel<<<1,1>>>(b); //this is also fine, performs implicit conversion to view
    cudaDeviceSynchronize();

    int i;
    cudaMemcpy(&i, b.buffer.data().get(), sizeof(int), cudaMemcpyDeviceToHost);
    printf("buffer[0] is %d\n", i); //prints 42
}
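Applied to your GpuMatrix, the same pattern might look like the sketch below. This is an illustration under the assumptions above (Thrust available, dimensions fixed at compile time), not a drop-in replacement; the View and at names simply mirror your original code:

#include <cstddef>
#include <thrust/device_vector.h>

template <typename T, size_t N, size_t M>
class GpuMatrix {
    thrust::device_vector<T> _data;  // owns the device memory; freed automatically

public:
    GpuMatrix() : _data(N * M) {}

    // Non-owning, trivially copyable view; safe to pass to kernels by value.
    struct View {
        T* data;
        __device__ T& at(size_t y, size_t x) { return data[y * M + x]; }
    };

    View view() { return View{_data.data().get()}; }
};

template <typename T, size_t N, size_t M, size_t Z>
__global__ void _multiplication_matrix(typename GpuMatrix<T, N, M>::View x,
                                       typename GpuMatrix<T, M, Z>::View y,
                                       typename GpuMatrix<T, N, Z>::View z) {
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
    // ... kernel body as before, using x.at(...), y.at(...), z.at(...)
}

// Usage: the template arguments must now be spelled out explicitly, because
// View is a dependent type and cannot be deduced from the call:
//   GpuMatrix<float, 2, 3> a; GpuMatrix<float, 3, 4> b; GpuMatrix<float, 2, 4> c;
//   _multiplication_matrix<float, 2, 3, 4><<<1, 1>>>(a.view(), b.view(), c.view());

Since only the trivially copyable View crosses the kernel boundary, the owning object's constructor and destructor run exclusively on the host, and deallocation happens automatically when the GpuMatrix goes out of scope.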