c pointers image-processing cuda fractals

Handling Image pointer to pointer in CUDA

I have been trying to put together CUDA code (taken in part from Fractal Image Compression by Yuval Fisher) that uses a double pointer to a 2D image. After taking care of the pointer-to-pointer allocation as suggested in an earlier answer, I am still getting a segmentation fault, along with the warning "Warning: Cannot tell what pointer points to, assuming global memory space". The entire code is posted below (my apologies for duplicating code that has already been posted elsewhere):

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define hsize 256
#define vsize 256

#define IMAGE_TYPE unsigned char


// Element-wise copy of one image into another.
// Launch layout used by main() in this file: one block per image row
// (blockIdx.x) and one thread per column (threadIdx.x).
__global__ void kernel(IMAGE_TYPE matrixin[][hsize], IMAGE_TYPE matrixout[][hsize]) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Read the source pixel first, then store it at the same coordinates.
    IMAGE_TYPE pixel = matrixin[row][col];
    matrixout[row][col] = pixel;
}

// Prints the message s followed by a newline to stderr and returns 1.
// Despite its name it does not terminate the program; the caller decides
// what to do with the returned error code.
// The parameter is const char* so that string literals can be passed
// without relying on the deprecated literal-to-char* conversion.
int fatal(const char* s) {
    fprintf(stderr,"%s\n",s);
    return 1;
}

/*
 * matrix_allocate(matrix, hsize, vsize, TYPE)
 *
 * Allocates a vsize x hsize image of TYPE that can be indexed as
 * matrix[row][col]. Two heap blocks are created: a table of vsize row
 * pointers (assigned to matrix) and one contiguous block holding all
 * hsize*vsize elements; row i of the table points hsize*i elements into
 * that block, so the contiguous data starts at matrix[0].
 *
 * On allocation failure fatal() is called, anything already allocated is
 * released and matrix is set to NULL, so the caller can test for failure
 * instead of receiving a table of invalid row pointers.
 *
 * The macro expands to a braced block: call sites do not need a trailing
 * semicolon. matrix must be an lvalue of type TYPE**.
 */
#define matrix_allocate(matrix,hsize,vsize,TYPE) {\
    TYPE *_data;\
    size_t _row;\
    (matrix)=(TYPE**)malloc((size_t)(vsize)*sizeof(TYPE*));\
    _data=(TYPE*)malloc((size_t)(hsize)*(size_t)(vsize)*sizeof(TYPE));\
    if((matrix)==NULL||_data==NULL) {\
        fatal("\nNo memory in matrix allocate.");\
        free(_data);   /* free(NULL) is a no-op */\
        free(matrix);\
        (matrix)=NULL;\
    } else {\
        for(_row=0;_row<(size_t)(vsize);++_row,_data+=(hsize))\
            (matrix)[_row]=_data;\
    }\
}


// Test driver from the question: builds a vsize x hsize host image filled
// with 'a', copies it to the device, launches kernel() to duplicate it and
// prints the top-left 10x10 corner of the result.
// None of the CUDA calls or allocations below have their results checked.
int main() {
// NOTE(review): IMarray is a whole vsize x hsize image, so IMarray* is a
// pointer to a 2D array, whereas kernel() takes a pointer to a row of hsize
// elements (IMAGE_TYPE (*)[hsize]). The two types do not match.
typedef IMAGE_TYPE IMarray[vsize][hsize];
// Host images as row-pointer tables (see matrix_allocate).
IMAGE_TYPE **hin_image,**hout_image;

// Device images.
IMarray *din_image,*dout_image;


//allocate host memory
// matrix_allocate makes hin_image a table of vsize row pointers; the pixel
// data itself is a separate contiguous block that starts at hin_image[0].
matrix_allocate(hin_image,hsize,vsize,IMAGE_TYPE)
for(int i=0;i<vsize;i++)
    for(int j=0;j<hsize;j++)
        hin_image[i][j]='a';

matrix_allocate(hout_image,hsize,vsize,IMAGE_TYPE)


//allocate device memory

cudaMalloc((void**)&din_image,(vsize*hsize)*sizeof(IMAGE_TYPE));
cudaMalloc((void**)&dout_image,(vsize*hsize)*sizeof(IMAGE_TYPE));

// NOTE(review): the source here is the row-pointer table (vsize pointers),
// not the pixel block, yet vsize*hsize bytes are copied from it. The pixel
// data lives at hin_image[0].
cudaMemcpy(din_image,hin_image, (vsize*hsize)*sizeof(IMAGE_TYPE),cudaMemcpyHostToDevice);

// One block per image row, one thread per column.
dim3 threads(hsize,1,1);
dim3 blocks(vsize,1,1);

kernel<<<blocks,threads>>>(din_image,dout_image);

// NOTE(review): the destination is hout_image's row-pointer table, so this
// copy overwrites the row pointers (and vsize*hsize bytes in total) rather
// than filling the pixel block at hout_image[0].
cudaMemcpy(hout_image,dout_image,(vsize*hsize)*sizeof(IMAGE_TYPE),cudaMemcpyDeviceToHost);

// Print the top-left 10x10 corner. hout_image[i] is read as a row pointer
// here, after the copy above has written image bytes over the table.
for(int i=0;i<10;i++) {
    printf("\n");
    for(int j=0;j<10;j++)
        printf("%c\t",hout_image[i][j]);
}
printf("\n");

cudaFree(din_image);
cudaFree(dout_image);

// NOTE(review): only the row-pointer tables are freed; the pixel blocks
// allocated inside matrix_allocate are not released here.
free(hin_image);
free(hout_image);

return 0;
}

I would like to know what is wrong with the standard 2D access of the image inside the kernel function. Any help would be greatly appreciated.

Solution

I'm not going to try and sort out your complex matrix allocation scheme. The purpose of my suggestion was so that you can simplify things to simple 1-line allocations.

Furthermore, I don't think you really grasped the example I gave. It was a 3D example, and the typedefs had 2 subscripts. A 2D version would have typedefs with a single subscript.

Really none of this has to do with CUDA. It revolves around understanding of C arrays and pointers.

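Here is your code with the major changes I made to get it working (a single-subscript typedef for one image row, and one-line host allocations in place of matrix_allocate):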

#include <stdio.h>
#include <stdlib.h>
#define hsize 256
#define vsize 256

#define IMAGE_TYPE unsigned char


// Copies matrixin to matrixout element by element.
//
// Launch layout: 1-D grid with one block per image row (blockIdx.x) and
// 1-D blocks with one thread per column (threadIdx.x).
// Preconditions: both arguments are device pointers to buffers holding at
// least vsize rows of hsize elements each.
// Threads that fall outside the vsize x hsize image do nothing, so a launch
// with more blocks or threads than the image needs cannot touch memory
// outside the two buffers.
__global__ void kernel(IMAGE_TYPE matrixin[][hsize], IMAGE_TYPE matrixout[][hsize]) {
  int tid=threadIdx.x;
  int bid=blockIdx.x;

  // Bounds guard: ignore surplus threads/blocks.
  if (bid >= vsize || tid >= hsize)
    return;

  matrixout[bid][tid]=matrixin[bid][tid];
}

// Prints the message s followed by a newline to stderr and returns 1.
// It does not terminate the program; the caller decides how to react to
// the returned error code.
// The parameter is const char* so that string literals can be passed
// without relying on the deprecated literal-to-char* conversion.
int fatal(const char* s) {
  fprintf(stderr,"%s\n",s);
  return 1;
}


// Reports a failed CUDA runtime call on stderr.
// Returns 0 when err is cudaSuccess, 1 otherwise.
static int cudaFailed(cudaError_t err, const char* what) {
  if (err == cudaSuccess)
    return 0;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return 1;
}

// Fills a vsize x hsize host image with 'a', copies it to the device, runs
// kernel() to duplicate it into a second device image, copies that back and
// prints the top-left 10x10 corner.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main() {
  // One image row. A pointer to IMarray indexes as image[row][col] over a
  // single contiguous buffer and has the same type as kernel()'s parameters.
  typedef IMAGE_TYPE IMarray[hsize];
  const size_t imageBytes = (size_t)vsize * hsize * sizeof(IMAGE_TYPE);

  IMarray *hin_image = NULL, *hout_image = NULL;
  IMarray *din_image = NULL, *dout_image = NULL;
  int status = 1;

  // Single-pass loop: `break` jumps to the shared cleanup below.
  do {
    //allocate host memory
    hin_image = (IMarray *)malloc(imageBytes);
    hout_image = (IMarray *)malloc(imageBytes);
    if (hin_image == NULL || hout_image == NULL) {
      fprintf(stderr, "host image allocation failed\n");
      break;
    }

    for(int i=0;i<vsize;i++)
      for(int j=0;j<hsize;j++)
        hin_image[i][j]='a';

    //allocate device memory
    if (cudaFailed(cudaMalloc((void**)&din_image, imageBytes), "cudaMalloc(din_image)"))
      break;
    if (cudaFailed(cudaMalloc((void**)&dout_image, imageBytes), "cudaMalloc(dout_image)"))
      break;
    // Clear the output so the printed 'a's can only come from the kernel.
    if (cudaFailed(cudaMemset(dout_image, 0, imageBytes), "cudaMemset(dout_image)"))
      break;
    if (cudaFailed(cudaMemcpy(din_image, hin_image, imageBytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)"))
      break;

    // One block per image row, one thread per column.
    dim3 threads(hsize,1,1);
    dim3 blocks(vsize,1,1);

    kernel<<<blocks,threads>>>(din_image,dout_image);
    // A launch reports configuration errors only through cudaGetLastError().
    if (cudaFailed(cudaGetLastError(), "kernel launch"))
      break;

    // Blocking copy on the default stream: it waits for the kernel, so any
    // error raised while the kernel ran is reported here.
    if (cudaFailed(cudaMemcpy(hout_image, dout_image, imageBytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)"))
      break;

    // Print the top-left 10x10 corner of the result.
    for(int i=0;i<10;i++) {
      printf("\n");
      for(int j=0;j<10;j++)
        printf("%c\t",hout_image[i][j]);
    }
    printf("\n");

    status = 0;
  } while (0);

  // cudaFree and free both accept null pointers, so this is safe on every path.
  cudaFree(din_image);
  cudaFree(dout_image);

  free(hin_image);
  free(hout_image);

  return status;
}