for loop in cuda kernel function give wrong value

i have some cuda code with a 2d kernel function like this:

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define row 65
#define col 13824

__global__ void tt(int *pp){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;

    for(unsigned ig=0;ig<33;ig++){
        pp[i*col+j]+=1;
    }
    return;
}

int main(){

    int *pp;
    int *rr;

    pp=(int*)malloc(sizeof(int)*col*row);
    rr=(int*)malloc(sizeof(int)*col*row);

    memset(pp,0,sizeof(int)*row*col);

    int *pp_g;
    cudaMalloc((void**)&pp_g,sizeof(int)*row*col);
    cudaMemcpy(pp_g,pp,sizeof(int)*row*col,cudaMemcpyHostToDevice);

    dim3 block(32,32,1);
    dim3 grid(row/32+1,col/32+1,1);

    tt<<<grid,block>>>(pp_g);
    cudaDeviceSynchronize();

    cudaMemcpy(rr,pp_g,sizeof(int)*row*col,cudaMemcpyDeviceToHost);

    int ct=0;
    for(unsigned i=0;i<row*col;i++){
        if(rr[i]!=33){
            //printf("%d\n",rr[i]);
            ct++;
        }
        //printf("%d\n",rr[i]);
    }
    printf("%d\n",ct);

    return 0;

}

the excepted results in rr array should all be 33, however, the actual results are all 0. But when I change block and grid to block(8,8,1) and grid(row/8+1,col/8+1,1), some results in rr array become correct while there are still 512 wrong results being 0 in rr array.

I cannot figure out what's wrong with my code. I want to know what happend with my results.

Solution

The threads of your kernel perform illegal memory accesses when i >= row and j >= col. Also, when i >= row and j < col, the threads perform memory writes concurrently with the "legal" threads.

Checking CUDA errors would have helped you understand the problem. See this post.