Convolution using CUDA with Sobel Filter

So I have implemented (or at least tried to) a Sobel filter in CUDA and my code is as follows. When I execute this file, i get half of a correctly Sobel filtered image and the other half is black. I am not able to upload the pictures because they are in .pgm format. So what the code does is read in a grayscale image which is in .pgm format and convolves Sobel filter mask with it using shared memory concept. I used a 1024 by 1024 .pgm image as input and it return a Sobel filtered image with edges that has half of it cut horizontally so it is black in the bottom half. Can someone please help me out here. Also, I am a little curious about the code, I do not really understand what the second batch loading does, so can you please explain that too.

sobel.cu

/* sobel.cu */

#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
#include "mypgm.h"

#define Mask_width  3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))


__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
    __shared__ float N_ds[w][w];
    int k;

    // First batch loading
    int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
        destY = dest / w, destX = dest % w,
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
        src = srcY * width + srcX;
    if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = I[src];
    else
        N_ds[destY][destX] = 0;
    for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
    {
        // Second batch loading
        dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w, destX = dest % w;
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
        src = srcY * width + srcX;
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = I[src];
            else
                N_ds[destY][destX] = 0;
        }
    }
    __syncthreads();

    float accum = 0;
    int y, x;
    for (y = 0; y < Mask_width; y++)
        for (x = 0; x < Mask_width; x++)
            accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];

    y = blockIdx.y * TILE_WIDTH + threadIdx.y;
    x = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (y < height && x < width)
        P[y * width + x] = accum;

    __syncthreads();

}

void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */

{
    /* Definition of Sobel filter in horizontal direction */
    float weight[3][3] = { { -1,  0,  1 },
              { -2,  0,  2 },
              { -1,  0,  1 } };
    float pixel_value;

    int x, y, i, j;  /* Loop variable */
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));

    cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);

    /* Maximum values calculation after filtering*/
    printf("Now, filtering of input image is performed\n\n");
    x_size2 = x_size1;
    y_size2 = y_size1;
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            image2[y][x] = 0;
        }
    }

    dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);


    cudaMemcpy(image2,
        deviceOutputImageData,
        x_size2 * y_size2 * sizeof(float),
        cudaMemcpyDeviceToHost);

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

}


int main()
{
    load_image_data();   /* Input of image1 */

    clock_t begin = clock();
    sobel_filtering();   /* Sobel filter is applied to image1 */
    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("\n\nTiming result of multiplication of matrix-vector: %f\n", time_spent);
    save_image_data();   /* Output of image2 */
    return 0;
}

mypgm.h

/* pgm file IO headerfile ------ mypgm.h */

/* Constant declaration */

//#define MAX_IMAGESIZE  1024

#define MAX_IMAGEWIDTH  3840
#define MAX_IMAGEHEIGHT  2160
#define MAX_BRIGHTNESS  255 /* Maximum gray level */
#define GRAYLEVEL       256 /* No. of gray levels */
#define MAX_FILENAME    256 /* Filename length limit */
#define MAX_BUFFERSIZE  256

/* Global constant declaration */
/* Image storage arrays */
float image1[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT],
image2[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT];
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */

/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/


/* Main body of functions */

void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ]，x_size1，y_size1 */
{
    char file_name[MAX_FILENAME];
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    printf("\n-----------------------------------------------------\n");
    printf("Monochromatic image file input routine \n");
    printf("-----------------------------------------------------\n\n");
    printf("     Only pgm binary file is acceptable\n\n");
    printf("Name of input image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
        exit(1);
    }
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
        exit(1);
    }
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
        }
    }
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
        }
    }
    /* Display of parameters */
    printf("\n     Image width = %d, Image height = %d\n", x_size1, y_size1);
    printf("     Maximum gray level = %d\n\n", max_gray);
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
            MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
        printf("     Please use smaller images!\n\n");
        exit(1);
    }
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
        exit(1);
    }
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y][x] = (unsigned char)fgetc(fp);
        }
    }
    printf("-----Image data input OK-----\n\n");
    printf("-----------------------------------------------------\n\n");
    fclose(fp);
}

void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/

{
    char file_name[MAX_FILENAME];
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    /* Output file open */
    printf("-----------------------------------------------------\n");
    printf("Monochromatic image file output routine\n");
    printf("-----------------------------------------------------\n\n");
    printf("Name of output image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y][x], fp);
        }
    }
    printf("\n-----Image data output OK-----\n\n");
    printf("-----------------------------------------------------\n\n");
    fclose(fp);
}

void load_image_file(char *filename)
/* Input of header & body information of pgm file */
/* for image1[ ][ ]，x_size1，y_size1 */
{
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    fp = fopen(filename, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
        exit(1);
    }
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
        exit(1);
    }
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
        }
    }
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
        }
    }
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
            MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
        printf("     Please use smaller images!\n\n");
        exit(1);
    }
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
        exit(1);
    }
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y][x] = (float)fgetc(fp);
        }
    }
    fclose(fp);
}

void save_image_file(char *filename)
/* Output of image2[ ][ ], x_size2, y_size2 */
/* into pgm file with header & body information */
{
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    fp = fopen(filename, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y][x], fp);
        }
    }
    fclose(fp);
}

Solution

The reason you are only seeing part of the image is because you have a mismatch between your host image buffer sizes and your device image buffer sizes.

On the host, you have the image buffers defined like this:

#define MAX_IMAGEWIDTH  3840
#define MAX_IMAGEHEIGHT  2160
...
float image1[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT],
image2[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT];

You then proceed to load a PGM image of dimensions 1024x1024. You then create your device storage of size 1024x1024:

cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));

where x_size and y_size1 here are defined by your PGM load routine, which will be 1024 each for a 1024x1024 image.

Then when you do a copy from host to device (a similar problem occurs on device->host copy as well):

cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);

you will be copying contiguous bytes from your host buffer to the device buffer. This means each host buffer line up to the full width of MAX_IMAGEWIDTH are copied to the device. But this is not really what you want. You only want each host buffer line copied up to x_size1 copied to the device.

There are several possible ways to fix this. I'm pretty sure the simplest would be simply to set MAX_IMAGEWIDTH and MAX_IMAGEHEIGHT to the actual values of the image you intend to use. When I did that, I got a plausible looking filtered result.

Since this limits you to processing a single image size, a better approach would be to dynamically define the size of your host image buffers, after you have read the PGM header data.

Or you could also use a method that involves cudaMemcpy2D, but this seems needlessly complex.

Regarding your second question, the reason for the "second batch loading" is because the first batch load only loads shared memory up to the dimensions of the threadblock, so it loads a 16x16 "patch" of shared memory, one element per thread. However we need the full shared memory array loaded, so we must do an extra "batch" of loading to fill the halo regions associated with the filter width and height.

Here is a modified file that seems to work correctly for me, demonstrating the method of dynamically allocating the host image buffers:

#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>

#define MAX_IMAGEWIDTH  2048
#define MAX_IMAGEHEIGHT 2048
#define MAX_BRIGHTNESS  255 /* Maximum gray level */
#define GRAYLEVEL       256 /* No. of gray levels */
#define MAX_FILENAME    256 /* Filename length limit */
#define MAX_BUFFERSIZE  256

/* Global constant declaration */
/* Image storage arrays */

float *image1, *image2;
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */

/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/


/* Main body of functions */

void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ]ï¼Œx_size1ï¼Œy_size1 */
{
    char file_name[MAX_FILENAME];
    char buffer[MAX_BUFFERSIZE];
    FILE *fp; /* File pointer */
    int max_gray; /* Maximum gray level */
    int x, y; /* Loop variable */

    /* Input file open */
    printf("\n-----------------------------------------------------\n");
    printf("Monochromatic image file input routine \n");
    printf("-----------------------------------------------------\n\n");
    printf("     Only pgm binary file is acceptable\n\n");
    printf("Name of input image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "rb");
    if (NULL == fp) {
        printf("     The file doesn't exist!\n\n");
        exit(1);
    }
    /* Check of file-type ---P5 */
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != 'P' || buffer[1] != '5') {
        printf("     Mistaken file format, not P5!\n\n");
        exit(1);
    }
    /* input of x_size1, y_size1 */
    x_size1 = 0;
    y_size1 = 0;
    while (x_size1 == 0 || y_size1 == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d %d", &x_size1, &y_size1);
        }
    }
    /* input of max_gray */
    max_gray = 0;
    while (max_gray == 0) {
        fgets(buffer, MAX_BUFFERSIZE, fp);
        if (buffer[0] != '#') {
            sscanf(buffer, "%d", &max_gray);
        }
    }
    /* Display of parameters */
    printf("\n     Image width = %d, Image height = %d\n", x_size1, y_size1);
    printf("     Maximum gray level = %d\n\n", max_gray);
    if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
        printf("     Image size exceeds %d x %d\n\n",
            MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
        printf("     Please use smaller images!\n\n");
        exit(1);
    }
    if (max_gray != MAX_BRIGHTNESS) {
        printf("     Invalid value of maximum gray level!\n\n");
        exit(1);
    }
    image1 = (float *)malloc(x_size1*y_size1*sizeof(float));
    /* Input of image data*/
    for (y = 0; y < y_size1; y++) {
        for (x = 0; x < x_size1; x++) {
            image1[y*x_size1+x] = (unsigned char)fgetc(fp);
        }
    }
    printf("-----Image data input OK-----\n\n");
    printf("-----------------------------------------------------\n\n");
    fclose(fp);
}

void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/

{
    char file_name[MAX_FILENAME];
    FILE *fp; /* File pointer */
    int x, y; /* Loop variable */

    /* Output file open */
    printf("-----------------------------------------------------\n");
    printf("Monochromatic image file output routine\n");
    printf("-----------------------------------------------------\n\n");
    printf("Name of output image file? (*.pgm) : ");
    scanf("%s", file_name);
    fp = fopen(file_name, "wb");
    /* output of pgm file header information */
    fputs("P5\n", fp);
    fputs("# Created by Image Processing\n", fp);
    fprintf(fp, "%d %d\n", x_size2, y_size2);
    fprintf(fp, "%d\n", MAX_BRIGHTNESS);
    /* Output of image data */
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            fputc(image2[y*x_size2+x], fp);
        }
    }
    printf("\n-----Image data output OK-----\n\n");
    printf("-----------------------------------------------------\n\n");
    fclose(fp);
}

#define Mask_width  3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))


__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
    __shared__ float N_ds[w][w];

    // First batch loading
    int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
        destY = dest / w, destX = dest % w,
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
        src = srcY * width + srcX;
    if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = I[src];
    else
        N_ds[destY][destX] = 0;
    for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
    {
        // Second batch loading
        dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w, destX = dest % w;
        srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
        srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
        src = srcY * width + srcX;
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
                N_ds[destY][destX] = I[src];
            else
                N_ds[destY][destX] = 0;
        }
    }
    __syncthreads();

    float accum = 0;
    int y, x;
    for (y = 0; y < Mask_width; y++)
        for (x = 0; x < Mask_width; x++)
            accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];

    y = blockIdx.y * TILE_WIDTH + threadIdx.y;
    x = blockIdx.x * TILE_WIDTH + threadIdx.x;
    if (y < height && x < width)
        P[y * width + x] = accum;

}

void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */

{
    /* Definition of Sobel filter in horizontal direction */
    float weight[3][3] = { { -1,  0,  1 },
              { -2,  0,  2 },
              { -1,  0,  1 } };

    int x, y;  /* Loop variable */
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
    cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));

    cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);

    /* Maximum values calculation after filtering*/
    printf("Now, filtering of input image is performed\n\n");
    x_size2 = x_size1;
    y_size2 = y_size1;
    image2 = (float *)malloc(x_size2*y_size2*sizeof(float));
    for (y = 0; y < y_size2; y++) {
        for (x = 0; x < x_size2; x++) {
            image2[y*x_size2+x] = 0;
        }
    }

    dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);


    cudaMemcpy(image2,
        deviceOutputImageData,
        x_size2 * y_size2 * sizeof(float),
        cudaMemcpyDeviceToHost);

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

}


int main()
{
    load_image_data();   /* Input of image1 */

    clock_t begin = clock();
    sobel_filtering();   /* Sobel filter is applied to image1 */
    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("\n\nTiming result of multiplication of matrix-vector: %f\n", time_spent);
    save_image_data();   /* Output of image2 */
    return 0;
}

Note the above code (your PGM load routine) has (IMO) a defect in that it requires the x and y sizes to be specified on the same line in the PGM file, but as far as I know this is not a requirement for P5 PGM files. If you pass it a valid P5 PGM file that has x and y sizes specified on different lines in the file, it will hang. I didn't attempt to fix that, as it doesn't appear to be the question you are asking.