I try to implement some sort of an Jacobi Algorithm and measure the time taken for different grid sizes.
For having the same amount of Iterations, regardless, how large the grid is, I don't use some sort of residuum, but let the Algorithm run 4000 Iterations always (but with different size of the arrays). That works exactly as it should, until the point I exceed a 510x510 grid (double). 510x510 takes about 2763498 microseconds and then 520x520 takes 1778 microseconds.
I already tried to change from double to float arrays, to make sure it's not some kind of memory shortage, but I can't figure out, where my problem really is hidden.
__global__ void Jacobi(double *a, double *b, double *c, int L){
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row > 0 && col > 0 && row < L-1 && col < L-1){
a[row * L + col] = 1.0/4.0 * (b[col+1 + row*L] + b[col - 1 + row*L] + b[col + (row+1)*L] + b[col + (row-1)*L] - c[col + row*L]);
__syncthreads();
b[row*L + col] = a[row*L+col];
}
}
int main(){
int L;
int Iterations;
double *h_phi1;
double *h_phi2;
double *h_f;
FILE * temp = fopen("Timings.out", "w");
for (L=10;L<10000;L+=10){
long long int size = L*L*sizeof(double);
h_f = (double*) malloc(size);
h_phi1 = (double*) malloc(size);
h_phi2 = (double*) malloc(size);
for(int i=0;i<L;i++){
for(int j=0;j<L;j++){
h_f[j+i*L] = (pow(1.0/float(L),2.0))*exp(-100.0*(pow((float(i)/float(L) - float(1.0/3.0) ),2.0) + pow(( float(j)/float(L) - float(1.0/3.0) ),2.0))) -
(pow(1.0/ float(L),2.0))*exp(- 100.0*(pow(( float(i)/ float(L) -(1.0- 1.0/3.0)),2.0) + pow(( float(j)/float(L)-(1.0-float(1.0/3.0))),2.0)));
h_phi1[j+i*L] = 0;
h_phi2[j+i*L] = 0;
}
}
//allocate memory on GPU
double *d_phi1;
cudaMalloc(&d_phi1, size);
double *d_phi2;
cudaMalloc(&d_phi2, size);
double *d_f;
cudaMalloc(&d_f, size);
//set CTA
int threads = 16;
int blocks = (L+threads-1)/threads;
double epsCalc;
//Setup Kernel launch parameters
dim3 dimBlock(threads, threads);
dim3 dimGrid(blocks, blocks);
//Setup timing and Cpy Memory from Host to Device
Iterations = 0;
auto t1 = std::chrono::high_resolution_clock::now();
cudaMemcpy(d_phi2, h_phi2, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_f, h_f, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_phi1, h_phi2, size, cudaMemcpyHostToDevice);
//Launch Kernel
for (int j=0;j<4000;j++){
Iterations += 1;
Jacobi<<<dimBlock, dimGrid>>>(d_phi2,d_phi1,d_f,L);
}
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();
fprintf(temp, "%lf % %d \n", L, duration);
printf("I reached the end of Jacobi after %d Iterations!\n Time taken was %d in milliseconds", Iterations, duration);
cudaFree(d_f); cudaFree(d_phi2), cudaFree(d_phi1);
free(h_f); free(h_phi2); free(h_phi1);
}
return 0;
}
I hope, somebody can guide me to where my mistake is made.
In CUDA, when specifying the kernel execution configuration arguments, the grid configuration comes first, followed by the block configuration.
Therefore, given the definition and usage of the variables dimGrid
and dimBlock
, this is incorrect:
Jacobi<<<dimBlock, dimGrid>>>
it should be:
Jacobi<<<dimGrid, dimBlock>>>
The grid and block configuration variables have different limits, so as it happens the first launch failure that occurred due to the mixup violating a hardware limit occurred at problem dimensions of 520,520