Separate `cudaMalloc` and `cudaMemcpy` in different functions?

I am using CUDA to accelerate my code, which processes images one by one in a loop. Each image is processed on the GPU via CUDA.

I referred to cuda-samples to write the code below:

file name: my_cuda.cu

#include "cuda_runtime.h"


// Processes a single image on the GPU: allocates the host and device buffers,
// uploads the input, runs the (omitted) GPU processing, downloads the result,
// and releases every buffer again before returning.
// Returns 0 after all steps have been issued; the CUDA return codes are not
// inspected in this sketch.
int process_one_image(args)
{
    // note that declaration of some params is omitted.
    unsigned char *h_data = (unsigned char *)malloc(size);
    unsigned char *h_rgb = (unsigned char *)malloc(size_result);
    // initialize the host memory as an image info.
    ...

    unsigned char *d_data;
    unsigned char *d_rgb;

    // Device buffers mirror the host buffers: `size` bytes of input,
    // `size_result` bytes of output.
    cudaMalloc((void **)&d_data, size);
    cudaMalloc((void **)&d_rgb, size_result);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

    // process the d_data on GPU
    ...

    // copy the result from device to host.
    cudaMemcpy(h_rgb, d_rgb, size_result, cudaMemcpyDeviceToHost);

    // Release host buffers first, then the device buffers.
    free(h_rgb);
    free(h_data);
    cudaFree(d_rgb);
    cudaFree(d_data);

    // The function is declared int, so hand back an explicit status.
    return 0;
}

In the code above, cudaMalloc and cudaMemcpy are in the same function, process_one_image, and the code runs correctly.

But I want to run this code repeatedly, e.g. in a loop of more than 10000 iterations, so I do not want to call cudaMalloc and cudaFree every time I process an image.

So I want to rearrange my code as shown below.

cuda_file: my_cuda.cu

#include "cuda_runtime.h"

// Intended to allocate the host and device buffers once, before the
// processing loop starts.
// NOTE(review): all four pointers are received by value, so the addresses
// assigned inside this function are not visible to the caller after it returns.
int initCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
    // note that declaration of some params is omitted.
    // NOTE(review): the next two declarations reuse the names of the
    // parameters h_data and h_rgb.
    unsigned char *h_data = (unsigned char *)malloc(size);
    unsigned char *h_rgb = (unsigned char *)malloc(size_result);
    // &d_data and &d_rgb are the addresses of this function's own parameter
    // copies, so cudaMalloc writes the device addresses into those copies.
    cudaMalloc((void **)&d_data, size);
    // NOTE(review): the single-function version allocated d_rgb with
    // size_result rather than size - confirm which is intended.
    cudaMalloc((void **)&d_rgb, size);
    // NOTE(review): declared int, but no value is returned.
}

// Releases the two device buffers and the two host buffers.
// Parameters: h_data / h_rgb are host pointers passed to free();
//             d_data / d_rgb are device pointers passed to cudaFree().
// Returns 0 when both cudaFree calls report cudaSuccess, -1 otherwise.
int FinalizeCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
    // Keep both statuses so a failure on the first buffer does not prevent
    // the second one from being released.
    cudaError_t data_status = cudaFree(d_data);
    cudaError_t rgb_status = cudaFree(d_rgb);

    free(h_data);
    free(h_rgb);

    // The function is declared int; falling off the end without a return
    // statement would leave the result undefined.
    return (data_status == cudaSuccess && rgb_status == cudaSuccess) ? 0 : -1;
}

// Processes one image using buffers that were allocated beforehand:
// uploads h_data into d_data, runs the (omitted) GPU processing, and
// downloads d_rgb into h_rgb.
// Returns 0 when both copies succeed and -1 as soon as one of them fails, so
// an invalid buffer pointer shows up as an error instead of an empty result.
int process_one_image(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb) // note some args are omitted such as size etc.
{
    // Upload the input image; give up immediately if the copy is rejected.
    if (cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) != cudaSuccess)
        return -1;

    // process the d_data on GPU
    ...

    // copy the result from device to host.
    // NOTE(review): the single-function version copied size_result bytes here
    // rather than size - confirm which length matches the result buffers.
    if (cudaMemcpy(h_rgb, d_rgb, size, cudaMemcpyDeviceToHost) != cudaSuccess)
        return -1;

    return 0;
}

my_c_code: c_code.c

#include "my_cuda.cu"


// Driver sketch: declares the four buffers, asks initCuda to set them up once,
// processes images in a loop, then hands the buffers to FinalizeCuda.
int processing_loop(args)  // specific args are omitted
{
    // declaration of host and device memory
    // NOTE(review): none of these pointers is given an initial value here.
    unsigned char *h_data;
    unsigned char *h_rgb;
    unsigned char *d_data;
    unsigned char *d_rgb;

    // The pointers are passed by value, so initCuda works on copies of them.
    initCuda(h_data, h_rgb, d_data, d_rgb);
    // NOTE(review): the loop has no exit condition, so the FinalizeCuda call
    // below is never reached as written.
    while (1)
    {
        // ret is assigned but not examined.
        int ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
    }
    FinalizeCuda(h_data, h_rgb, d_data, d_rgb);

}

Here you can see that I want to call cudaMalloc only once, from the C file, in order to accelerate this code, but it does not work correctly. No errors are reported, but I get nothing in h_rgb.

My guess is that cudaMemcpy cannot find the correct address of d_data, so nothing is actually copied to d_data.

So how can I fix this bug? Or, is this a proper way to execute cudaMalloc only once?

The whole code is located in ffio key_file_path:

ffio/ffio/ffio_c/ffio.c - corresponding to the c_code.c example file.
ffio/ffio/ffio_c/yuv2rgb.cu - corresponding to the my_cuda.cu example.

how to run the whole example:

./compiler.sh to build the executable file main
execute main located in ffio/ffio/ffio_c/test
check the variable ffio->cudaFrame using Ctrl+F

Solution

Basic C/C++: You want the initCuda function to set a pointer in the processing_loop function. So you have to pass a pointer to the pointer to the initCuda function. Here is a minimal fix.

// Allocates the two host buffers and the two device buffers once and returns
// them through the pointer-to-pointer parameters, so the caller's own
// pointer variables are updated.
// Parameters: h_data / h_rgb receive host allocations from malloc();
//             d_data / d_rgb receive device allocations from cudaMalloc().
// On any allocation failure everything acquired so far is released and all
// four outputs are left NULL, which the caller can test for.
void initCuda(unsigned char **h_data, unsigned char **h_rgb,
             unsigned char **d_data, unsigned char **d_rgb)
{
    int ok;

    // Start from a known state so a partial failure never leaves
    // indeterminate values in the caller's pointers.
    *h_data = NULL;
    *h_rgb = NULL;
    *d_data = NULL;
    *d_rgb = NULL;

    *h_data = (unsigned char *)malloc(size);
    *h_rgb = (unsigned char *)malloc(size_result);
    ok = (*h_data != NULL) && (*h_rgb != NULL);

    // d_data and d_rgb already have type unsigned char **, so they are passed
    // straight through; no address-of operator is needed here.
    if (ok && cudaMalloc((void **)d_data, size) != cudaSuccess)
    {
        *d_data = NULL;
        ok = 0;
    }
    // NOTE(review): the single-function version sized d_rgb with size_result
    // rather than size - confirm which is intended.
    if (ok && cudaMalloc((void **)d_rgb, size) != cudaSuccess)
    {
        *d_rgb = NULL;
        ok = 0;
    }

    if (!ok)
    {
        // Roll back; free(NULL) and cudaFree(NULL) are both no-ops.
        cudaFree(*d_rgb);
        cudaFree(*d_data);
        free(*h_rgb);
        free(*h_data);
        *h_data = NULL;
        *h_rgb = NULL;
        *d_data = NULL;
        *d_rgb = NULL;
    }
}
// Driver sketch: allocates the buffers once through initCuda, processes
// images repeatedly with the same buffers, and releases them at the end.
// Returns the status of the last process_one_image call.
int processing_loop(args)  // specific args are omitted
{
    // declaration of host and device memory
    // Initialised to NULL so the pointers never hold indeterminate values,
    // even if initCuda does not assign one of them.
    unsigned char *h_data = NULL;
    unsigned char *h_rgb = NULL;
    unsigned char *d_data = NULL;
    unsigned char *d_rgb = NULL;
    int ret = 0;

    // Pass the address of each pointer so initCuda can store the allocations
    // in this function's variables.
    initCuda(&h_data, &h_rgb, &d_data, &d_rgb);
    while (1)
    {
        ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
        // Leave the loop on failure so the cleanup below is reachable.
        // Assumes a non-zero return signals failure - TODO confirm against
        // process_one_image. The "no more images" condition is omitted along
        // with the other arguments.
        if (ret != 0)
            break;
    }
    FinalizeCuda(h_data, h_rgb, d_data, d_rgb);

    return ret;
}

Note that there are other issues with your code such as your lack of error checking.