Search code examples
cuda

Separate `cudaMalloc` and `cudaMemcpy` in different functions?


I am using CUDA to accelerate my code, in which I process every image in a loop. Each image is processed on the GPU via CUDA.

I referred to cuda-samples to write the code below:

  • file name: my_cuda.cu
#include "cuda_runtime.h"


/*
 * Process one image on the GPU using buffers that live only for this call.
 *
 * Allocates host and device buffers, uploads the input, runs the (omitted)
 * GPU processing, downloads the result and releases everything again.
 *
 * Returns 0 when every allocation and copy succeeded, -1 otherwise.
 */
int process_one_image(args)
{
    // note that declaration of some params is omitted.
    int status = -1; // becomes 0 only after the result has reached the host
    unsigned char *h_data = (unsigned char *)malloc(size);
    unsigned char *h_rgb = (unsigned char *)malloc(size_result);
    unsigned char *d_data = NULL;
    unsigned char *d_rgb = NULL;

    if (h_data != NULL && h_rgb != NULL)
    {
        // initialize the host memory as an image info.
        ...

        // Each device buffer is released only inside the branch where its
        // allocation succeeded, so no failed allocation is ever freed.
        if (cudaMalloc((void **)&d_data, size) == cudaSuccess)
        {
            if (cudaMalloc((void **)&d_rgb, size_result) == cudaSuccess)
            {
                if (cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) == cudaSuccess)
                {
                    // process the d_data on GPU
                    ...

                    // copy the result from device to host.
                    if (cudaMemcpy(h_rgb, d_rgb, size_result, cudaMemcpyDeviceToHost) == cudaSuccess)
                        status = 0;
                }
                cudaFree(d_rgb);
            }
            cudaFree(d_data);
        }
    }

    // free(NULL) is a no-op, so the host buffers are released unconditionally.
    free(h_rgb);
    free(h_data);
    return status;
}

In the code above, cudaMalloc and cudaMemcpy are in the same function, process_one_image, and the code runs correctly.

But I want to run this code repeatedly, for example in a loop of more than 10000 iterations, so I do not want to call cudaMalloc and cudaFree every time I process an image.

So I wanna change my code into the below arrangement.

  • cuda_file: my_cuda.cu
#include "cuda_runtime.h"

// Intended to allocate the host and device buffers once, before the processing loop.
// NOTE(review): all four pointers are received by value, so the addresses stored
// below are written to this function's own copies and are not visible to the caller.
// NOTE(review): declared to return int, but there is no return statement.
int initCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
    // note that declaration of some params is omitted.
    // NOTE(review): the two declarations below reuse the names of the parameters
    // h_data and h_rgb; a C/C++ compiler would normally reject this, so it is
    // presumably an artifact of simplifying the example.
    unsigned char *h_data = (unsigned char *)malloc(size);
    unsigned char *h_rgb = (unsigned char *)malloc(size_result);
    // &d_data / &d_rgb are the addresses of the local parameter copies.
    cudaMalloc((void **)&d_data, size);
    // d_rgb is sized with size here, while h_rgb above is sized with size_result.
    cudaMalloc((void **)&d_rgb, size);
}

/*
 * Release the device and host buffers used by the processing loop.
 *
 * d_data, d_rgb: device pointers handed to cudaFree.
 * h_data, h_rgb: host pointers handed to free.
 *
 * Returns 0 when both cudaFree calls succeeded, -1 otherwise.
 */
int FinalizeCuda(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb)
{
    // Both device frees are attempted even if the first one fails, so one
    // error does not cause the other buffer to be skipped.
    cudaError_t err_data = cudaFree(d_data);
    cudaError_t err_rgb = cudaFree(d_rgb);

    free(h_data);
    free(h_rgb);

    return (err_data == cudaSuccess && err_rgb == cudaSuccess) ? 0 : -1;
}

/*
 * Process one image using buffers allocated by the caller.
 *
 * Uploads h_data to d_data, runs the (omitted) GPU processing, then downloads
 * d_rgb into h_rgb.
 *
 * Returns 0 when both copies succeeded, -1 as soon as one of them fails.
 */
int process_one_image(unsigned char *h_data, unsigned char *h_rgb, unsigned char *d_data, unsigned char *d_rgb) // note some args are omitted such as size etc.
{
    if (cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) != cudaSuccess)
        return -1;

    // process the d_data on GPU
    ...

    // copy the result from device to host.
    // NOTE(review): h_rgb is allocated with size_result in initCuda, but size
    // bytes are copied here -- confirm which length is intended.
    if (cudaMemcpy(h_rgb, d_rgb, size, cudaMemcpyDeviceToHost) != cudaSuccess)
        return -1;

    return 0;
}
  • my_c_code: c_code.c
#include "my_cuda.cu"


// Driver: calls initCuda once, then process_one_image repeatedly, then FinalizeCuda.
// NOTE(review): declared to return int, but there is no return statement.
int processing_loop(args)  // specific args are omitted
{
    // declaration of host and device memory
    // NOTE(review): none of these four pointers is given an initial value.
    unsigned char *h_data;
    unsigned char *h_rgb;
    unsigned char *d_data;
    unsigned char *d_rgb;

    // The pointer values (not their addresses) are passed, so initCuda has no
    // way to store the allocated addresses into the variables declared above.
    initCuda(h_data, h_rgb, d_data, d_rgb);
    // No exit condition is visible for this loop, so FinalizeCuda below is not
    // reached as written.
    while (1)
    {
        // ret is assigned but not inspected.
        int ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
    }
    FinalizeCuda(h_data, h_rgb, d_data, d_rgb);

}

Here, you can see that I want to call cudaMalloc only once, from the C file, in order to accelerate this code, but I find that it does not work correctly. It reports no errors, but I get nothing in h_rgb.

It seems (I guess) that when cudaMemcpy runs, it cannot find the correct address of d_data to copy to.

So how can I fix this bug, OR, is it a proper way to execute cudaMalloc only once?

The whole code is located in ffio key_file_path:

  • ffio/ffio/ffio_c/ffio.c - corresponding to the c_code.c example file.
  • ffio/ffio/ffio_c/yuv2rgb.cu - corresponding to the my_cuda.cu example.

how to run the whole example:

  • ./compiler.sh to build the executable file main
  • execute main located in ffio/ffio/ffio_c/test
  • check the variable ffio->cudaFrame using Ctrl+F

Solution

  • Basic C/C++: You want the initCuda function to set a pointer in the processing_loop function. So you have to pass a pointer to the pointer to the initCuda function. Here is a minimal fix.

    /*
     * Allocate the host and device buffers once and hand them back to the
     * caller through pointer-to-pointer arguments.
     *
     * h_data, d_data: receive the input buffers (size bytes each).
     * h_rgb,  d_rgb:  receive the result buffers (size_result bytes each).
     *
     * On success all four outputs are non-NULL. If any allocation fails,
     * everything allocated so far is released and all four outputs are NULL,
     * so the caller can detect failure with a simple NULL check.
     */
    void initCuda(unsigned char **h_data, unsigned char **h_rgb,
                 unsigned char **d_data, unsigned char **d_rgb)
    {
        // Give the outputs a defined value before anything can fail.
        *h_data = NULL;
        *h_rgb = NULL;
        *d_data = NULL;
        *d_rgb = NULL;

        unsigned char *host_in = (unsigned char *)malloc(size);
        unsigned char *host_out = (unsigned char *)malloc(size_result);
        unsigned char *dev_in = NULL;
        unsigned char *dev_out = NULL;
        int ok = (host_in != NULL && host_out != NULL);

        if (ok && cudaMalloc((void **)&dev_in, size) != cudaSuccess)
        {
            dev_in = NULL;
            ok = 0;
        }
        // The device result buffer is sized with size_result so that it
        // matches the host result buffer it is copied into.
        if (ok && cudaMalloc((void **)&dev_out, size_result) != cudaSuccess)
        {
            dev_out = NULL;
            ok = 0;
        }

        if (!ok)
        {
            // cudaFree(NULL) and free(NULL) perform no operation, so this
            // unwinding is safe whichever allocation failed.
            cudaFree(dev_out);
            cudaFree(dev_in);
            free(host_out);
            free(host_in);
            return;
        }

        *h_data = host_in;
        *h_rgb = host_out;
        *d_data = dev_in;
        *d_rgb = dev_out;
    }
    /*
     * Allocate the buffers once, process images until a call reports failure,
     * then release the buffers.
     *
     * Returns the status that ended the loop, or -1 if the buffers could not
     * be allocated.
     */
    int processing_loop(args)  // specific args are omitted
    {
        // declaration of host and device memory
        // NULL-initialised so a failed allocation can be detected below.
        unsigned char *h_data = NULL;
        unsigned char *h_rgb = NULL;
        unsigned char *d_data = NULL;
        unsigned char *d_rgb = NULL;
        int ret = 0;

        initCuda(&h_data, &h_rgb, &d_data, &d_rgb);
        if (h_data == NULL || h_rgb == NULL || d_data == NULL || d_rgb == NULL)
            ret = -1;

        // NOTE(review): assumes process_one_image returns 0 on success and
        // non-zero on failure -- confirm against its implementation. A real
        // loop would also stop when there are no more images to process.
        while (ret == 0)
        {
            ret = process_one_image(h_data, h_rgb, d_data, d_rgb);
        }

        // Reached on every path, so the buffers are always released.
        FinalizeCuda(h_data, h_rgb, d_data, d_rgb);
        return ret;
    }
    

    Note that there are other issues with your code such as your lack of error checking.