
What is a correct way to implement memcpy inside a CUDA kernel?


I am implementing a PDE solver (Lax-Friedrichs) in CUDA that I previously wrote in C. Please find the C code below:

void solve(int M, double u[M+3][M+3], double unp1[M+3][M+3], double params[3]){
    int i;
    int j;
    int n;

    for (n=0; n<params[0]; n++){
        for (i=0; i<M+2; i++)
            for (j=0; j<M+2; j++){
                unp1[i][j] = 0.25*(u[i+1][j] + u[i-1][j] + u[i][j+1] + u[i][j-1])
                    - params[1]*(u[i+1][j] - u[i-1][j])
                    - params[2]*(u[i][j+1] - u[i][j-1]);
            }

        memcpy(u, unp1, pow(M+3,2)*sizeof(double));

        /*Periodic Boundary Conditions*/
        for (i=0; i<M+2; i++){
            u[0][i] = u[N+1][i];
            u[i][0] = u[i][N+1];
            u[N+2][i] = u[1][i];
            u[i][N+2] = u[i][1];
        }
    }
}

And it works fine. But when I try to implement it in CUDA I do not get the same data. Unfortunately I cannot pinpoint the exact problem, since I am a total beginner at parallel programming, but I think it might have to do with the u[i*(N+3) + j] = unp1[i*(N+3) + j] in the solver, since I cannot really perform a memcpy inside the kernel; it doesn't change anything, and I don't know how to proceed. I took a look at this previous answer, but it unfortunately couldn't help solve my problem. Here is the solver I am trying to write in CUDA:

#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>

/*Configuration of the grid*/
const int N = 100; //Number of nodes  
const double xmin = -1.0;
const double ymin = -1.0;
const double xmax = 1.0;
const double ymax = 1.0;
const double tmax = 0.5;

/*Configuration of the simulation physics*/
const double dx = (xmax - xmin)/N;
const double dy = (ymax - ymin)/N;
const double dt = 0.009;
const double vx = 1.0;
const double vy = 1.0;


__global__ void initializeDomain(double *x, double *y){
    /*Initializes the grid of size (N+3)x(N+3) to better accommodate the boundary conditions*/
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int j=index ; j<N+3; j+=stride){
        x[j] = xmin + (j-1)*dx;
        y[j] = ymin + (j-1)*dy;
    }
}


__global__ void initializeU(double *x, double *y, double *u0){
    
    double sigma_x = 2.0;
    double sigma_y = 6.0;
    
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    
    for (int i = index_x; i < N+3; i += stride_x)
        for (int j = index_y; j < N+3; j+= stride_y){
            u0[i*(N+3) + j] = exp(-200*(pow(x[i],2)/(2*pow(sigma_x,2)) + pow(y[j],2)/(2*pow(sigma_y,2))));
            u0[i*(N+3) + j] *= 1/(2*M_PI*sigma_x*sigma_y);
               //u[i*(N+3) + j] = u0[i*(N+3) + j];
               //unp1[i*(N+3) + j] = u0[i*(N+3) + j];
    }
}


void initializeParams(double params[3]){
    params[0] = round(tmax/dt);
    params[1] = vx*dt/(2*dx);
    params[2] = vy*dt/(2*dy);
}


__global__ void solve(double *u, double *unp1, double params[3]){

     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;
     int index_y = blockIdx.y * blockDim.y + threadIdx.y;
     int stride_y = blockDim.y * gridDim.y; 

     for (int i = index_x; i < N+2; i += stride_x)
          for (int j = index_y; j < N+2; j += stride_y){
               unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
               - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
               - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
          }

}


__global__ void bc(double *u){
     int index_x = blockIdx.x * blockDim.x + threadIdx.x;
     int stride_x = blockDim.x * gridDim.x;


     /* Boundary conditions are also set in parallel */
     for (int i = index_x; i < N+2; i += stride_x){
          u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
          u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
          u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
          u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
     }
}


int main(){
     int i;
     int j;

     double *x = (double *)malloc((N+3)*sizeof(double));
     double *y = (double *)malloc((N+3)*sizeof(double));

     double *d_x, *d_y;
     cudaMalloc(&d_x, (N+3)*sizeof(double));
     cudaMalloc(&d_y, (N+3)*sizeof(double));

     initializeDomain<<<1, 1>>>(d_x, d_y);
     cudaDeviceSynchronize();

     cudaMemcpy(x, d_x, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(y, d_y, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     FILE *fout1 = fopen("data_x.csv", "w");
     FILE *fout2 = fopen("data_y.csv", "w");

     for (i=0; i<N+3; i++){
          if (i==N+2){
               fprintf(fout1, "%.5f", x[i]);
               fprintf(fout2, "%.5f", y[i]);
          }
          else{
               fprintf(fout1, "%.5f, ", x[i]);
               fprintf(fout2, "%.5f, ", y[i]);
          }
     }


     dim3 Block2D(1,1);
     dim3 ThreadsPerBlock(1,1);

     double *d_u0;
     double *u0 = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_u0, (N+3)*(N+3)*sizeof(double));

     initializeU<<<Block2D, ThreadsPerBlock>>>(d_x, d_y, d_u0);
     cudaDeviceSynchronize();
     cudaMemcpy(u0, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     /*Initialize parameters*/
     double params[3];
     initializeParams(params);

     /*Allocate memory for u and unp1 on device for the solver*/
     double *d_u, *d_unp1;
     cudaMalloc(&d_u, (N+3)*(N+3)*sizeof(double));
     cudaMalloc(&d_unp1, (N+3)*(N+3)*sizeof(double));
     cudaMemcpy(d_u, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     cudaMemcpy(d_unp1, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
     
     /*Solve*/
     for (int n=0; n<params[0]; n++){
          solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
          double *temp = d_u;
          d_u = d_unp1;
          d_unp1 = temp;
          bc<<<1,1>>>(d_u);
          cudaDeviceSynchronize();
     }

     /*Copy results on host*/
     double *u = (double *)malloc((N+3)*(N+3)*sizeof(double));
     cudaMemcpy(u, d_u, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);

     FILE *fu = fopen("data_u.csv", "w");
     for (i=0; i<N+3; i++){
          for (j=0; j<N+3; j++)
               if (j==N+2)
                    fprintf(fu, "%.5f", u[i*(N+3) + j]);
               else
                    fprintf(fu, "%.5f, ", u[i*(N+3) + j]);
          fprintf(fu, "\n");
     }
     fclose(fu);

     free(x);
     free(y);
     free(u0);
     free(u);
     cudaFree(d_x);
     cudaFree(d_y);
     cudaFree(d_u0);
     cudaFree(d_u);
     cudaFree(d_unp1);

     return 0;
}

I unfortunately keep having the same issue: The data I get is 0.0000.


Solution

  • One thing that is tripping you up is that your original algorithm has an ordering that is required for correctness:

    1. Update unp from u
    2. Copy unp to u
    3. Enforce the boundary condition
    4. Repeat

    Your algorithm requires that step 1 be completed entirely before step 2 begins, and likewise for step 2 before step 3. Your CUDA realization (putting steps 1 and 3, or steps 1, 2, and 3, into a single kernel) does not preserve or guarantee that ordering. CUDA threads can execute in any order. If you apply that rigorously to your code (for example, imagine that the thread with index 0 executes completely before any other thread begins; that would be valid CUDA execution), you will see that your kernel design does not preserve the ordering your algorithm requires.

    So do something like this:

    1. Create a solve kernel that is just the first step:

       __global__ void solve(double *u, double *unp1, double params[3]){
      
            int index_x = blockIdx.x * blockDim.x + threadIdx.x;
            int stride_x = blockDim.x * gridDim.x;
            int index_y = blockIdx.y * blockDim.y + threadIdx.y;
            int stride_y = blockDim.y * gridDim.y;
      
            for (int i = index_x; i < N+2; i += stride_x)
                for (int j = index_y; j < N+2; j += stride_y){
                     unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)]) \
                     - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j]) \
                     - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
                }
       }
      
    2. Don't bother with the memcpy operation; the better way to do that is to swap the pointers in host code (if you really want an explicit copy, see the device-to-device sketch after step 4 below).

    3. Create a separate kernel to enforce the boundary:

       __global__ void bc(double *u, double *unp1, double params[3]){
      
            int index_x = blockIdx.x * blockDim.x + threadIdx.x;
            int stride_x = blockDim.x * gridDim.x;
            int index_y = blockIdx.y * blockDim.y + threadIdx.y;
            int stride_y = blockDim.y * gridDim.y;
            /* Boundary conditions are also enforced in parallel */
            for (int i = index_x; i < N+2; i += stride_x){
                 u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
                 u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
                 u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
                 u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
            }
       }
      
    4. Modify your host code to call these kernels in sequence, with the pointer swap in-between:

         /*Solve*/
         for(int n = 0; n<params[0]; n++){
              solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
              double *temp = d_u;
              d_u = d_unp1;
              d_unp1 = temp;
              bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
              cudaDeviceSynchronize();
         }
      

    (coded in browser, not tested)

    This will enforce the ordering that your algorithm requires.
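
    If you really do want the explicit copy from your original C code rather than the pointer swap, a minimal sketch of one alternative (assuming everything else stays as in step 4) is to issue a device-to-device copy from host code between the two kernel launches. Work issued into the same (default) stream executes in order, so the copy starts only after solve has finished and completes before bc begins:

         /* Variant of the step 4 loop; d_u, d_unp1, params, Block2D and
            ThreadsPerBlock are the same variables used above. */
         for (int n = 0; n < params[0]; n++){
              solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
              /* Stream-ordered: runs only after solve has completed. */
              cudaMemcpy(d_u, d_unp1, (N+3)*(N+3)*sizeof(double),
                         cudaMemcpyDeviceToDevice);
              bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
              cudaDeviceSynchronize();
         }

    The pointer swap is still preferable, since it avoids an (N+3)x(N+3) copy every time step.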

    NOTE: As identified in the comments below, the solve kernel as depicted above (and in OP's original post, and in their posted CPU code version) has indexing errors, at least associated with the i-1 and j-1 indexing patterns. These should be fixed, otherwise the code is broken. Fixing them requires a decision about what to do for the edge cases, which OP provides no guidance on, so I have left that code as-is.
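
    For illustration only, a sketch of one possible treatment: assuming the ghost-cell layout implied by the bc kernel (interior cells at indices 1 through N+1, ghost rows/columns at 0 and N+2 that the boundary kernel refreshes each step), the update loops can start at 1 so that the i-1 and j-1 (and i+1, j+1) accesses never leave the (N+3)x(N+3) array:

       __global__ void solve(double *u, double *unp1, double params[3]){

            int index_x = blockIdx.x * blockDim.x + threadIdx.x;
            int stride_x = blockDim.x * gridDim.x;
            int index_y = blockIdx.y * blockDim.y + threadIdx.y;
            int stride_y = blockDim.y * gridDim.y;

            /* Update interior cells 1..N+1 only; the ghost cells at 0 and N+2
               are filled by the bc kernel, so every neighbor access below
               stays inside the allocation. */
            for (int i = index_x + 1; i < N+2; i += stride_x)
                for (int j = index_y + 1; j < N+2; j += stride_y){
                     unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)])
                     - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j])
                     - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
                }
       }

    Whether the interior should span 1..N+1, as assumed here, depends on how you want the grid to map onto the periodic domain, so treat this as a sketch rather than a definitive fix.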