I am using CUDA to accelerate my code, in which I process each image in a loop. Each image is processed on the GPU via CUDA.
I referred to cuda-samples to write the code below:
my_cuda.cu
#include "cuda_runtime.h"
// Processes a single image on the GPU. Every call allocates the host and
// device buffers, uploads the input, runs the (elided) GPU processing,
// downloads the result and then releases all four buffers again.
// Returns 0 after the buffers have been released.
int process_one_image(args)
{
    // note that declaration of some params is omitted.
    unsigned char *h_data = (unsigned char *)malloc(size);
    unsigned char *h_rgb = (unsigned char *)malloc(size_result);
    // initialize the host memory as an image info.
    ...
    unsigned char *d_data;
    unsigned char *d_rgb;
    // Device buffers mirror the host buffers: `size` bytes of input,
    // `size_result` bytes of output.
    cudaMalloc((void **)&d_data, size);
    cudaMalloc((void **)&d_rgb, size_result);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
    // process the d_data on GPU
    ...
    // copy the result from device to host.
    cudaMemcpy(h_rgb, d_rgb, size_result, cudaMemcpyDeviceToHost);
    free(h_rgb);
    free(h_data);
    cudaFree(d_rgb);
    cudaFree(d_data);
    // The function is declared int, so it must return a value.
    return 0;
}
In the code above, cudaMalloc and cudaMemcpy are in the same function, process_one_image, and the code runs correctly.
But I want to run this code repeatedly, e.g. in a loop of more than 10000 iterations, so I do not want to call cudaMalloc and cudaFree every time I process an image.
So I want to rearrange my code as shown below.
my_cuda.cu
#include "cuda_runtime.h"
// Intended to allocate the host and device buffers once, before the
// processing loop starts.
// NOTE(review): all four pointers are received by value, so the addresses
// stored by malloc/cudaMalloc below live only in this function's own copies
// and are never visible to the caller.
// NOTE(review): the function is declared int but contains no return statement.
int initCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
// note that declaration of some params is omitted.
// NOTE(review): h_data and h_rgb are declared again here with the same names
// as the parameters.
unsigned char *h_data = (unsigned char *)malloc(size);
unsigned char *h_rgb = (unsigned char *)malloc(size_result);
// &d_data / &d_rgb are the addresses of the local parameter copies.
cudaMalloc((void **)&d_data, size);
// NOTE(review): d_rgb is allocated with `size` although h_rgb above uses
// `size_result` -- confirm which length the result buffer needs.
cudaMalloc((void **)&d_rgb, size);
}
// Releases the two device buffers with cudaFree and the two host buffers
// with free.
// Returns 0 when both cudaFree calls report cudaSuccess, -1 otherwise.
int FinalizeCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
    // Attempt both device frees even if the first one fails, so that one
    // failure does not prevent the other buffer from being released.
    cudaError_t err_data = cudaFree(d_data);
    cudaError_t err_rgb = cudaFree(d_rgb);
    free(h_data);
    free(h_rgb);
    return (err_data == cudaSuccess && err_rgb == cudaSuccess) ? 0 : -1;
}
// Processes one image using buffers that were allocated by the caller:
// uploads h_data into d_data, runs the (elided) GPU processing, then
// downloads d_rgb into h_rgb.
// Returns 0 on success, -1 if either cudaMemcpy fails.
int process_one_image(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb) // note some args are omitted such as size etc.
{
    // Check the copy result: an invalid device pointer would otherwise fail
    // silently and leave h_rgb without any output.
    if (cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) != cudaSuccess)
        return -1;
    // process the d_data on GPU
    ...
    // copy the result from device to host.
    // NOTE(review): this copies `size` bytes; confirm it matches the length
    // the result buffers were allocated with.
    if (cudaMemcpy(h_rgb, d_rgb, size, cudaMemcpyDeviceToHost) != cudaSuccess)
        return -1;
    return 0;
}
c_code.c
#include "my_cuda.cu"
// Declares the host/device buffer pointers, calls initCuda once, then calls
// process_one_image repeatedly, and finally calls FinalizeCuda.
// NOTE(review): the function is declared int but contains no return statement.
int processing_loop(args) // specific args are omitted
{
// declaration of host and device memory
// The four pointers are declared without an initial value.
unsigned char *h_data;
unsigned char *h_rgb;
unsigned char *d_data;
unsigned char *d_rgb;
// NOTE(review): the pointer values themselves are passed here, so initCuda
// works on copies and cannot change these four variables.
initCuda(h_data, h_rgb, d_data, d_rgb);
// The loop has no exit condition as written.
while (1)
{
// ret is assigned but not used.
int ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
}
// Not reached as written, because the loop above never exits.
FinalizeCuda(h_data, h_rgb, d_data, d_rgb);
}
Here you can see that I want to call cudaMalloc only once, from the C file, in order to accelerate this code, but it did not work correctly. No errors were reported, but I get nothing in h_rgb.
My guess is that cudaMemcpy could not find the correct address of d_data, so nothing was copied to d_data.
So how can I fix this bug? Or, is this a proper way to execute cudaMalloc only once?
The whole code is located in the ffio repository. Key file paths:
- ffio/ffio/ffio_c/ffio.c - corresponding to the c_code.c example file.
- ffio/ffio/ffio_c/yuv2rgb.cu - corresponding to the my_cuda.cu example.
- ./compiler.sh - builds the executable file main.
- main - located in ffio/ffio/ffio_c/test.
The relevant code can be found by searching for ffio->cudaFrame using Ctrl+F.

Basic C/C++: You want the initCuda function to set the pointers declared in the processing_loop function. Because arguments are passed by value, you have to pass a pointer to each pointer to the initCuda function. Here is a minimal fix.
// Allocates the host and device buffers once and stores their addresses in
// the caller's pointer variables.
// Each parameter is the address of one of the caller's pointers, so writing
// through it changes the caller's variable rather than a local copy.
void initCuda(unsigned char **h_data, unsigned char **h_rgb,
              unsigned char **d_data, unsigned char **d_rgb)
{
    *h_data = (unsigned char *)malloc(size);
    *h_rgb = (unsigned char *)malloc(size_result);
    // d_data / d_rgb already are pointers-to-pointers, so they are passed to
    // cudaMalloc directly (no extra & here).
    cudaMalloc((void **)d_data, size);
    // The device result buffer is sized like the host result buffer h_rgb.
    cudaMalloc((void **)d_rgb, size_result);
}
// Allocates the buffers once via initCuda, processes images in a loop using
// those same buffers, and releases them with FinalizeCuda.
int processing_loop(args) // specific args are omitted
{
    // declaration of host and device memory
    // Start from NULL so that a failed allocation leaves a well-defined
    // value instead of an indeterminate pointer.
    unsigned char *h_data = NULL;
    unsigned char *h_rgb = NULL;
    unsigned char *d_data = NULL;
    unsigned char *d_rgb = NULL;
    // Pass the ADDRESS of each pointer so initCuda can fill them in.
    initCuda(&h_data, &h_rgb, &d_data, &d_rgb);
    // NOTE(review): the loop has no exit condition as shown, so the cleanup
    // below is only reached once a break is added.
    while (1)
    {
        int ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
    }
    FinalizeCuda(h_data, h_rgb, d_data, d_rgb);
    return 0;
}
Note that there are other issues with your code such as your lack of error checking.