C++/CUDA weird behavior with large arrays

I try to implement some sort of an Jacobi Algorithm and measure the time taken for different grid sizes.

For having the same amount of Iterations, regardless, how large the grid is, I don't use some sort of residuum, but let the Algorithm run 4000 Iterations always (but with different size of the arrays). That works exactly as it should, until the point I exceed a 510x510 grid (double). 510x510 takes about 2763498 microseconds and then 520x520 takes 1778 microseconds.

I already tried to change from double to float arrays, to make sure it's not some kind of memory shortage, but I can't figure out, where my problem really is hidden.

__global__ void Jacobi(double *a, double *b, double *c, int L){
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

if(row > 0 && col > 0 && row < L-1 && col < L-1){
    a[row * L + col] = 1.0/4.0 * (b[col+1 + row*L] + b[col - 1 + row*L] + b[col + (row+1)*L] + b[col + (row-1)*L] - c[col + row*L]);
    __syncthreads();
    b[row*L + col] = a[row*L+col];
    }
}

int main(){
int L;
int Iterations;
double *h_phi1;
double *h_phi2;

double *h_f;
FILE * temp = fopen("Timings.out", "w");
for (L=10;L<10000;L+=10){
    long long int size = L*L*sizeof(double);
    h_f = (double*) malloc(size);
    h_phi1 = (double*) malloc(size);
    h_phi2 = (double*) malloc(size);


    for(int i=0;i<L;i++){
        for(int j=0;j<L;j++){
            h_f[j+i*L] = (pow(1.0/float(L),2.0))*exp(-100.0*(pow((float(i)/float(L) - float(1.0/3.0) ),2.0) + pow(( float(j)/float(L) - float(1.0/3.0) ),2.0))) - 
            (pow(1.0/ float(L),2.0))*exp(- 100.0*(pow(( float(i)/ float(L) -(1.0- 1.0/3.0)),2.0) + pow(( float(j)/float(L)-(1.0-float(1.0/3.0))),2.0)));
            h_phi1[j+i*L] = 0;
            h_phi2[j+i*L] = 0;
        }
    }

    //allocate memory on GPU
    double *d_phi1;
    cudaMalloc(&d_phi1, size);
    double *d_phi2;
    cudaMalloc(&d_phi2, size);
    double *d_f;
    cudaMalloc(&d_f, size);
    
    //set CTA
    int threads = 16;
    int blocks = (L+threads-1)/threads;
    double epsCalc;

    //Setup Kernel launch parameters
    dim3 dimBlock(threads, threads);
    dim3 dimGrid(blocks, blocks);

    //Setup timing and Cpy Memory from Host to Device
    Iterations = 0;
    auto t1 = std::chrono::high_resolution_clock::now();
    cudaMemcpy(d_phi2, h_phi2, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_f, h_f, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_phi1, h_phi2, size, cudaMemcpyHostToDevice);

    //Launch Kernel
    for (int j=0;j<4000;j++){
        Iterations += 1;
        Jacobi<<<dimBlock, dimGrid>>>(d_phi2,d_phi1,d_f,L);

    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();
    
    fprintf(temp, "%lf % %d \n", L, duration);
    printf("I reached the end of Jacobi after %d Iterations!\n Time taken was %d in milliseconds", Iterations, duration);
    cudaFree(d_f); cudaFree(d_phi2), cudaFree(d_phi1);
    free(h_f); free(h_phi2); free(h_phi1);
}

return 0;
}

I hope, somebody can guide me to where my mistake is made.

Solution

In CUDA, when specifying the kernel execution configuration arguments, the grid configuration comes first, followed by the block configuration.

Therefore, given the definition and usage of the variables dimGrid and dimBlock, this is incorrect:

    Jacobi<<<dimBlock, dimGrid>>>

it should be:

    Jacobi<<<dimGrid, dimBlock>>>

The grid and block configuration variables have different limits, so as it happens the first launch failure that occurred due to the mixup violating a hardware limit occurred at problem dimensions of 520,520