I am a little confused about how to use the CULA device interface. Right now, I am using the CULA host interface in a cpp file, and I am generating some random numbers in a cu file.
cu file:
...
__global__ void kernel( double * A,double * B, curandState * globalState, int Asize, int Bsize )
{
// generate random numbers
...
// Host-side wrapper: allocates two host arrays, has `kernel` fill matching
// device arrays, and copies the device results back into the host arrays.
//
// Parameters:
//   A_host, B_host - out-parameters; *A_host / *B_host are set to malloc'ed
//                    arrays of Asize / Bsize doubles. Nothing in the visible
//                    code frees them (presumably the caller must free() them
//                    - confirm).
//   Asize, Bsize   - element counts of the A and B arrays.
void kernel_wrapper(
double ** const A_host,
double ** const B_host,
const int Asize ,
const int Bsize )
{
...
// device storage for N curandState objects (N is defined outside this snippet)
curandState * devStates;
gpuErrchk( cudaMalloc( &devStates, N * sizeof(curandState) ) );
// allocate host memory for the results; the malloc return values are not checked here
*A_host = (double*) malloc( Asize * sizeof(double) );
*B_host = (double*) malloc( Bsize * sizeof(double) );
// allocate device memory of the same sizes as the host arrays
double * A_dev, * B_dev;
gpuErrchk( cudaMalloc( (void**) &A_dev, Asize * sizeof(double) ) );
gpuErrchk( cudaMalloc( (void**) &B_dev, Bsize * sizeof(double) ) );
// setup seeds: one block of N threads; the current time is passed as an
// unsigned value (presumably the RNG seed - setup_kernel is not shown)
setup_kernel<<<1,N>>>( devStates, unsigned( time(NULL)) );
...
// generate random numbers: launched as one block containing a single thread
kernel<<<1,1>>>( A_dev, B_dev, devStates, Asize, Bsize );
// a launch returns no status itself, so query the last error explicitly
gpuErrchk( cudaPeekAtLastError() );
// block until the kernel has finished; execution errors surface here
gpuErrchk( cudaDeviceSynchronize() );
// copy result from device to host
gpuErrchk( cudaMemcpy( *A_host, A_dev, Asize * sizeof(double), cudaMemcpyDeviceToHost ) );
gpuErrchk( cudaMemcpy( *B_host, B_dev, Bsize * sizeof(double), cudaMemcpyDeviceToHost ) );
// clean up device memory (the host arrays are left allocated for the caller)
gpuErrchk( cudaFree( A_dev ) );
gpuErrchk( cudaFree( B_dev ) );
gpuErrchk( cudaFree( devStates ) );
return;
}
cpp file:
...
// Declaration of the wrapper that is implemented in the cu file.
extern void kernel_wrapper(double** A,double** B, int Asize ,int Bsize);
...
// Host pointers; kernel_wrapper stores the addresses of the host arrays it
// allocates into them.
culaDouble* A;
culaDouble* B;
kernel_wrapper( &A, &B, Asize, Bsize );
...
// CULA solver call on the two host arrays; its return code is kept in status.
status = culaDgels('N',N,N, NRHS, A, N, B, N);
So, I am allocating host memory in the cu file and passing it to the cpp file.
What should I change if I want to use the CULA device interface instead?
I can't figure out how to manage the memory transfers.
I don't know CULA. However, after a brief look at the reference guide (which I suggest consulting before asking on SO), it appears that you can use the CULA device functions just like the host functions, except that you have to pass device memory pointers to them.
__global__ void kernel( double * A,double * B, curandState * globalState, int Asize, int Bsize )
{
// generate random numbers
...
// Host-side wrapper: has `kernel` fill the two caller-provided buffers.
//
// Parameters:
//   A, B         - buffers handed straight to the kernel launch, so they must
//                  be device pointers; this function neither allocates nor
//                  frees them.
//   Asize, Bsize - element counts of A and B, forwarded to the kernel.
void kernel_wrapper(
double * const A,
double * const B,
const int Asize ,
const int Bsize )
{
...
// device storage for N curandState objects (N is defined outside this snippet)
curandState * devStates;
gpuErrchk( cudaMalloc( &devStates, N * sizeof(curandState) ) );
// setup seeds: one block of N threads; the current time is passed as an
// unsigned value (presumably the RNG seed - setup_kernel is not shown)
setup_kernel<<<1,N>>>( devStates, unsigned( time(NULL)) );
...
// generate random numbers: launched as one block containing a single thread
kernel<<<1,1>>>( A, B, devStates, Asize, Bsize );
// a launch returns no status itself, so query the last error explicitly
gpuErrchk( cudaPeekAtLastError() );
// block until the kernel has finished; execution errors surface here
gpuErrchk( cudaDeviceSynchronize() );
// clean up the random states; A and B stay allocated for the caller
gpuErrchk( cudaFree( devStates ) );
return;
}
and in your cpp:
// Declaration must match the wrapper's definition in the cu file, which takes
// the two buffers as plain pointers (double*), not pointer-to-pointer.
extern void kernel_wrapper(double* A, double* B, int Asize, int Bsize);
...
// A and B hold device addresses obtained from cudaMalloc below; they must not
// be dereferenced on the host.
culaDouble* A;
culaDouble* B;
gpuErrchk( cudaMalloc( (void**) &A, Asize * sizeof(double) ) );
gpuErrchk( cudaMalloc( (void**) &B, Bsize * sizeof(double) ) );
// fill both device buffers in place; no host copy is made
kernel_wrapper( A, B, Asize, Bsize );
...
// device-interface solver: receives the device pointers directly
status = culaDeviceDgels('N',N,N, NRHS, A, N, B, N);
// release the device buffers once the solver call has returned
gpuErrchk( cudaFree( A ) );
gpuErrchk( cudaFree( B ) );
That's it. You don't even need host memory as long as everything remains in device memory.
Finally, may I suggest that you take a look at the CUDA Programming Guide? I think it will help you understand the differences between host and device memory, and how "memory transfers" to and from a CUDA device work.