How to implement nearest neighbours image resizing algorithm in CUDA?

My main purpose is to load frames from a video with OpenCV, then copy it Nvidia Gpu memory, resize it with a Cuda based nearest neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow()

Unfortunately, I always got segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions. Below, you can find the main parts of the source code, but here is the repo for the full project: https://github.com/foxakarmi/imageResize

Main function:

#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>

void *buffers[3];

int main() {

    cv::VideoCapture capture;
    cv::Mat frame;

    capture.open("/p.mp4");

    if (!capture.isOpened()) {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
    CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
    buffers[2] = malloc(3 * 640 * 640);

    while (capture.read(frame)) {
        CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))

        cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);

        CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))

        cv::Mat foo;
        foo.data = static_cast<uchar *>(buffers[2]);
        cv::imshow("img", foo);
        cv::waitKey(1);
    }

    capture.release();
    return 0;
}

The .cu file containing the kernel and a wrapper function:

#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"

__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    int channel = 3;

    if (i < dst_h && j < dst_w) {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_h;

        dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
        dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
        dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
    }
}

cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;

    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    kernelNearestNeighbourResize <<< 3600, 256>>>(
            src_img, dst_img, src_w,
            src_h, dst_w, dst_h);

    return cudaGetLastError();
}

Solution

Below you can see a complete working solution.

There are 3 main issues in your code:

The setup for the CUDA grid is incorrect. See an example how to set it in my code below (just an initial working version that you can further improve). See some general info here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize. See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer for the output cv::Mat foo, without properly initializing it. So the cv::Mat metadata (rows, cols etc.) were not set and cv::imshow could not show it properly. In my code it is not required - see below.

Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking if dst_img was already initialized, and if not (since it's the first frame) - initialize it and the CUDA buffers.

Some more notes on the code below:

There's no need to allocate buffer[2] for the host output image. Instead I initialized the cv::Mat with the proper size and use it's allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than making it assume it is 3.
I passed the step (AKA stride) of the images to the kernel. This will support the case where the images have padding (see about it here: stride and padding of an image).

Code for main:

#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>

#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }

cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);

int main()
{
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened()) 
    {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    int src_w = frame.cols;
    int src_h = frame.rows;
    int src_step = (int)frame.step[0];
    int channels = frame.channels();
    int data_type = frame.type();
    assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U);   // assert that it is a uchar image

    // Parameters you can change:
    int dst_w = 640;
    int dst_h = 640;

    cv::Mat dst_img(dst_h, dst_w, data_type);
    int dst_step = (int)dst_img.step[0];

    void * src_dev_buffer;
    void * dst_dev_buffer;
    CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
    CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));

    while (capture.read(frame))
    {
        // assert that the current frame has the same type and dimensions as the first one (should be guaranteed by the video decoder):
        assert(frame.cols == src_w);
        assert(frame.rows == src_h);
        assert((int)frame.step[0] == src_step);
        assert(frame.type() == data_type);

        CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
        CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));
        cv::imshow("dst_img", dst_img);
        cv::waitKey(1);
    }

    CUDA_CHECK(cudaFree(src_dev_buffer));
    CUDA_CHECK(cudaFree(dst_dev_buffer));

    capture.release();
    return 0;
}

Code for the CUDA kernel and the wrapping function:

#include "cuda_runtime.h"

__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < dst_h && j < dst_w) 
    {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_w;

        int src_offset = i * dst_step + j * channels;
        int dst_offset = iIn * src_step + jIn * channels;
        for (int c = 0; c < channels; ++c) 
        {
            dst_img[src_offset + c] = src_img[dst_offset + c];
        }
    }
}

cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;

    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    // The grid dimensions
    dim3 dimBlock(32, 32);
    dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);

    kernelNearestNeighbourResize << < dimGrid, dimBlock >> >(
        src_img, dst_img, channels,
        src_w, src_h, src_step, dst_w, dst_h, dst_step);

    return cudaGetLastError();
}