Search code examples
Tags: c, pointers, image-processing, cuda, fractals

Handling Image pointer to pointer in CUDA


I have been trying to get a piece of CUDA code working (taken in part from Fractal Image Compression by Yuval Fisher) that uses a double pointer (pointer to pointer) to address a 2D image. After reworking the pointer-to-pointer allocation as suggested in an earlier answer, I am still getting a segmentation fault, along with the warning "Warning: Cannot tell what pointer points to, assuming global memory space". The entire code is posted below (my apologies for duplicating the posted code):

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define hsize 256
#define vsize 256

#define IMAGE_TYPE unsigned char


/*
 * Element-wise copy of a 2D image: every thread copies exactly one element
 * from matrixin to matrixout.
 *
 * blockIdx.x selects the first (row) index and threadIdx.x the second
 * (column) index; main() in this file launches vsize blocks of hsize threads.
 * The kernel performs no bounds checking of its own.
 */
__global__ void kernel(IMAGE_TYPE matrixin[][hsize], IMAGE_TYPE matrixout[][hsize]) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    matrixout[row][col] = matrixin[row][col];
}

/*
 * Report an error message on stderr, followed by a newline.
 *
 * s: message to print; a NULL pointer is reported as "(null)".
 *
 * Returns 1 in all cases. Despite its name this function does NOT terminate
 * the program; the caller decides how to react.
 *
 * The parameter is const-qualified so that string literals can be passed
 * without the invalid literal-to-char* conversion in C++.
 */
int fatal(const char* s) {
    fprintf(stderr, "%s\n", s != NULL ? s : "(null)");
    return 1;
}

/*
 * matrix_allocate(matrix, hsize, vsize, TYPE)
 *
 * Allocates a host-side 2D matrix addressable as matrix[row][col]:
 *   - one contiguous buffer of hsize*vsize elements of TYPE, and
 *   - a table of vsize row pointers (stored in `matrix`), where entry i
 *     points at element i*hsize of the contiguous buffer.
 *
 * `matrix` itself therefore points at the row-pointer table, not at the
 * element data; the element data starts at matrix[0].
 *
 * If either allocation fails, the error is reported through fatal() and the
 * program exits, because fatal() returns to its caller and the fill loop
 * below would otherwise dereference a NULL pointer.
 *
 * The macro expands to a brace-enclosed block so that it can be used as a
 * statement without a trailing semicolon.
 */
#define matrix_allocate(matrix,hsize,vsize,TYPE) {\
    TYPE *_pixels;\
    int _row;\
    (matrix)=(TYPE**)malloc((size_t)(vsize)*sizeof(TYPE*));\
    _pixels=(TYPE*)malloc((size_t)(hsize)*(size_t)(vsize)*sizeof(TYPE));\
    if((matrix)==NULL||_pixels==NULL) {\
        fatal("\nNo memory in matrix allocate.");\
        exit(EXIT_FAILURE);\
    }\
    for(_row=0;_row<(vsize);++_row,_pixels+=(hsize))\
        (matrix)[_row]=_pixels;\
}


/*
 * Test driver as posted in the question: fills a vsize x hsize host image
 * with 'a', copies it to the device, runs the copy kernel, copies the result
 * back and prints the top-left 10x10 corner.
 *
 * NOTE(review): this version is the failing one under discussion. The
 * comments below mark the lines that do not do what was intended; the code
 * itself is left exactly as posted.
 */
int main() {
// NOTE(review): IMarray is a complete 2D array type here, so "IMarray *" is a
// pointer to a whole vsize x hsize image. The kernel parameters
// (IMAGE_TYPE x[][hsize]) are pointers to a single row of hsize elements, so
// the argument and parameter types differ.
typedef IMAGE_TYPE IMarray[vsize][hsize];
// Host images: matrix_allocate stores a row-pointer table in these variables.
IMAGE_TYPE **hin_image,**hout_image;

// Device images (see the note on IMarray above).
IMarray *din_image,*dout_image;


// Allocate host memory. After matrix_allocate, hin_image points at a table of
// vsize row pointers; the pixel data itself starts at hin_image[0].
matrix_allocate(hin_image,hsize,vsize,IMAGE_TYPE)
for(int i=0;i<vsize;i++)
    for(int j=0;j<hsize;j++)
        hin_image[i][j]='a';

matrix_allocate(hout_image,hsize,vsize,IMAGE_TYPE)


// Allocate device memory: one flat buffer of vsize*hsize elements per image.
// NOTE(review): return codes of the CUDA calls in this function are ignored.

cudaMalloc((void**)&din_image,(vsize*hsize)*sizeof(IMAGE_TYPE));
cudaMalloc((void**)&dout_image,(vsize*hsize)*sizeof(IMAGE_TYPE));

// NOTE(review): the source here is the row-pointer table, not the pixels.
// The table holds only vsize*sizeof(IMAGE_TYPE*) bytes, while this copy reads
// vsize*hsize*sizeof(IMAGE_TYPE) bytes from it, so it reads past the end of
// the table -- a likely source of the reported segmentation fault. The pixel
// data lives at hin_image[0].
cudaMemcpy(din_image,hin_image, (vsize*hsize)*sizeof(IMAGE_TYPE),cudaMemcpyHostToDevice);

// One block per image row, one thread per column.
dim3 threads(hsize,1,1);
dim3 blocks(vsize,1,1);

kernel<<<blocks,threads>>>(din_image,dout_image);

// NOTE(review): the destination is again the row-pointer table, so this copy
// overwrites the row pointers (and memory beyond the table) with image bytes.
cudaMemcpy(hout_image,dout_image,(vsize*hsize)*sizeof(IMAGE_TYPE),cudaMemcpyDeviceToHost);

// Print the top-left 10x10 corner of the result.
// NOTE(review): hout_image[i] is dereferenced here after the table was
// overwritten by the copy above.
for(int i=0;i<10;i++) {
    printf("\n");
    for(int j=0;j<10;j++)
        printf("%c\t",hout_image[i][j]);
}
printf("\n");

cudaFree(din_image);
cudaFree(dout_image);

// NOTE(review): only the row-pointer tables are freed; the contiguous pixel
// buffers allocated inside matrix_allocate are never released.
free(hin_image);
free(hout_image);

return 0;
}

I would like to know what is wrong with the standard 2D access of the image inside the kernel function. Any help would be most welcome.


Solution

  • I'm not going to try and sort out your complex matrix allocation scheme. The purpose of my suggestion was so that you can simplify things to simple 1-line allocations.

    Furthermore, I don't think you really grasped the example I gave. It was a 3D example, and the typedefs had 2 subscripts. A 2D version would have typedefs with a single subscript.

    Really none of this has to do with CUDA. It revolves around understanding of C arrays and pointers.

    These are the major changes I made to get your code working:

    #include <stdio.h>
    #include <stdlib.h>
    #define hsize 256
    #define vsize 256
    
    #define IMAGE_TYPE unsigned char
    
    
    /*
     * Element-wise copy of a 2D image: every thread copies exactly one element
     * from matrixin to matrixout.
     *
     * blockIdx.x selects the first (row) index and threadIdx.x the second
     * (column) index; main() in this file launches vsize blocks of hsize
     * threads. The kernel performs no bounds checking of its own.
     */
    __global__ void kernel(IMAGE_TYPE matrixin[][hsize], IMAGE_TYPE matrixout[][hsize]) {
      const int row = blockIdx.x;
      const int col = threadIdx.x;

      matrixout[row][col] = matrixin[row][col];
    }
    
    /*
     * Report an error message on stderr, followed by a newline.
     *
     * s: message to print; a NULL pointer is reported as "(null)".
     *
     * Returns 1 in all cases. Despite its name this function does NOT
     * terminate the program; the caller decides how to react.
     *
     * The parameter is const-qualified so that string literals can be passed
     * without the invalid literal-to-char* conversion in C++.
     */
    int fatal(const char* s) {
      fprintf(stderr, "%s\n", s != NULL ? s : "(null)");
      return 1;
    }
    
    
    /*
     * Exit with a readable message if a CUDA runtime call did not succeed.
     *
     * status: value returned by the CUDA runtime call.
     * what:   short description of the call, used in the error message.
     */
    static void checkCuda(cudaError_t status, const char *what) {
      if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        /* Process exit releases the host and device allocations. */
        exit(EXIT_FAILURE);
      }
    }

    /*
     * Test driver: fills a vsize x hsize host image with 'a', copies it to the
     * device, runs the copy kernel, copies the result back and prints the
     * top-left 10x10 corner.
     *
     * Returns 0 on success and 1 if host memory cannot be allocated; any CUDA
     * failure terminates the program through checkCuda().
     */
    int main() {
      /* One image row. A pointer to IMarray indexes a single contiguous
         buffer as image[row][col], which is the type the kernel expects. */
      typedef IMAGE_TYPE IMarray[hsize];
      const size_t imageBytes = (size_t)vsize * (size_t)hsize * sizeof(IMAGE_TYPE);

      /* Allocate host memory: one flat buffer per image. */
      IMarray *hin_image = (IMarray *)malloc(imageBytes);
      IMarray *hout_image = (IMarray *)malloc(imageBytes);
      if (hin_image == NULL || hout_image == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)imageBytes);
        free(hin_image);   /* free(NULL) is a no-op */
        free(hout_image);
        return 1;
      }

      for (int i = 0; i < vsize; i++)
        for (int j = 0; j < hsize; j++)
          hin_image[i][j] = 'a';

      /* Allocate device memory with the same flat layout. */
      IMarray *din_image = NULL;
      IMarray *dout_image = NULL;
      checkCuda(cudaMalloc((void **)&din_image, imageBytes), "cudaMalloc(din_image)");
      checkCuda(cudaMalloc((void **)&dout_image, imageBytes), "cudaMalloc(dout_image)");

      /* Clear the output so stale device data cannot pass for a result. */
      checkCuda(cudaMemset(dout_image, 0, imageBytes), "cudaMemset(dout_image)");
      checkCuda(cudaMemcpy(din_image, hin_image, imageBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(host -> device)");

      /* One block per image row, one thread per column. */
      dim3 threads(hsize, 1, 1);
      dim3 blocks(vsize, 1, 1);

      kernel<<<blocks, threads>>>(din_image, dout_image);
      /* A launch returns no status; configuration errors surface here. */
      checkCuda(cudaGetLastError(), "kernel launch");

      /* This blocking copy also reports errors raised while the kernel ran. */
      checkCuda(cudaMemcpy(hout_image, dout_image, imageBytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(device -> host)");

      /* Print the top-left 10x10 corner of the result. */
      for (int i = 0; i < 10; i++) {
        printf("\n");
        for (int j = 0; j < 10; j++)
          printf("%c\t", hout_image[i][j]);
      }
      printf("\n");

      checkCuda(cudaFree(din_image), "cudaFree(din_image)");
      checkCuda(cudaFree(dout_image), "cudaFree(dout_image)");

      free(hin_image);
      free(hout_image);

      return 0;
    }