Search code examples
cudamemcpy

cudaMemcpyAsync and memcpy fail to copy inside a kernel while direct copying works


I am trying to copy from a source float array (containing 1.0f) to a destination float array (containing 2.0f) inside a CUDA kernel. I tried three different ways:

  • cudaMemcpyAsync
  • memcpy
  • direct copy (dst[i] = src[i])

When I read the results after the kernel has executed, I find that both cudaMemcpyAsync and memcpy have failed to copy, while the direct copy method has worked.

Why have the cudaMemcpyAsync and memcpy methods failed?

I am using GTX TitanX(SM_52).

compiled using: nvcc -arch=compute_52 main.cu

main.cu:

#include <stdio.h>
#include <iostream>


// Experiment kernel: each thread with idx < size is meant to copy one float
// from src to dst using exactly one of the three commented-out statements
// below. With all three commented out, as here, the kernel writes nothing.
//
// src, dst: float buffers holding at least `size` elements each.
// size:     number of elements to copy.
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  // Flat 1D global thread index.
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Bounds guard: threads past the end of the data do nothing.
    if(idx < size){
// The two variants marked FAILS TO COPY share the same mistakes:
//   * dst and src are float*, so pointer arithmetic is already scaled by
//     sizeof(float); `dst + idx*sizeof(float)` addresses element
//     idx*sizeof(float), not element idx (and runs past a size-element
//     buffer for most idx).
//   * the size argument is a byte count, so `1` copies a single byte rather
//     than one whole float.
//        memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
//        cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
// Plain element assignment indexes in units of float and copies all of it.
//          dst[idx] = src[idx]; // COPIES SUCCESSFULLY
    }

}

// Running counter backing UniqueNumber(); starts at zero.
int current = 0;

// Advances the counter by one and returns the new value, so successive
// calls yield 1, 2, 3, ...
int UniqueNumber()
{
  current += 1;
  return current;
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Host driver for the copy experiment.
//
// Fills a source buffer with 1.0f and a destination buffer with 2.0f, uploads
// both, runs cudamemcpy_inside_kernel once, downloads the destination and
// prints every element. A working in-kernel copy prints 1.000000 throughout;
// any 2.000000 means that element was not copied.
//
// Returns 0 on success, 1 if a host allocation or any CUDA call failed.
int main(void)
{
  const int N = 1000;
  const size_t bytes = N*sizeof(float);
  const int threads = 256;
  const int blocks = (N + threads - 1)/threads;  // ceil-div so every element has a thread

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;

  // `ok` short-circuits the rest of the pipeline after the first failure
  // while still falling through to the cleanup at the bottom.
  bool ok = (x != NULL) && (y != NULL);
  if (!ok)
    fprintf(stderr, "host allocation failed\n");

  ok = ok && checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }
  }

  ok = ok && checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x <- x)");
  ok = ok && checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y <- y)");

  if (ok) {
    cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
    // Launch-configuration errors are only visible through cudaGetLastError().
    ok = checkCuda(cudaGetLastError(), "kernel launch");
  }
  // Errors raised while the kernel runs surface at the next synchronizing call.
  ok = ok && checkCuda(cudaDeviceSynchronize(), "kernel execution");

  // cudaMemcpy is blocking, so no further synchronization is needed before reading y.
  ok = ok && checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y <- d_y)");

  if (ok) {
    for (int i = 0; i < N; i++)
      printf(" %f\n", y[i]); // y[i] should have all 1.0f
  }

  // Release everything on both the success and the failure path;
  // cudaFree(NULL) and free(NULL) are no-ops.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  return ok ? 0 : 1;
}

Solution

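  • You have the source, destination, and size arguments wrong in both memcpy calls: `dst` and `src` are `float*`, so pointer arithmetic is already scaled by `sizeof(float)` (use `dst+idx`, not `dst + idx*sizeof(float)`), and the size argument is a byte count (use `sizeof(float)`, not `1`). Something like this: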

    #include <stdio.h>
    #include <iostream>
    
    // Copies one float per thread from src to dst, using the mechanism chosen
    // at compile time by `action`:
    //   1     -> device-side memcpy of sizeof(float) bytes
    //   2     -> device-side cudaMemcpyAsync of sizeof(float) bytes
    //   other -> plain element assignment
    // Threads whose flat 1D index is >= size do nothing.
    // NOTE(review): the action == 2 path calls a runtime API from device code;
    // the accompanying build commands compile with -dc for this -- confirm the
    // link requirements for your toolkit version.
    template<int action>
    __global__
    void cudamemcpy_inside_kernel(float *src, float *dst, int size)
    {
      const int element = blockIdx.x*blockDim.x + threadIdx.x;
      if(element >= size)
        return;

      // float* arithmetic is already in units of float, so +element is enough.
      float *to = dst + element;
      float *from = src + element;

      if(action == 1) {
        memcpy(to, from, sizeof(float));
      } else if(action == 2) {
        cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
      } else {
        *to = *from;
      }
    }
    
    // Prints a diagnostic for a failed CUDA runtime call.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char *what)
    {
      if (status == cudaSuccess)
        return true;
      fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
      return false;
    }

    // Runs one copy trial with the kernel variant selected by `action`.
    //
    // Prints `title`, refills h_buf with 2.0f and uploads it to d_dst, launches
    // cudamemcpy_inside_kernel<action> to copy d_src into d_dst, downloads d_dst
    // into h_buf and prints its n elements. Returns false if any CUDA call failed.
    template<int action>
    static bool runTrial(const char *title, float *d_src, float *d_dst, float *h_buf, int n)
    {
      const size_t bytes = n*sizeof(float);

      printf("%s", title);

      // Reset the destination to the sentinel before EVERY trial. Re-uploading
      // the previous trial's result would leave d_dst already equal to the
      // source, and a kernel that copied nothing would still appear to succeed.
      for (int i = 0; i < n; i++)
        h_buf[i] = 2.0f;
      if (!cudaOk(cudaMemcpy(d_dst, h_buf, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y <- y)"))
        return false;

      cudamemcpy_inside_kernel<action><<<(n+255)/256, 256>>>(d_src, d_dst, n);
      // Launch-configuration errors are only visible through cudaGetLastError().
      if (!cudaOk(cudaGetLastError(), "kernel launch"))
        return false;

      // The blocking copy waits for the kernel and reports any execution error.
      if (!cudaOk(cudaMemcpy(h_buf, d_dst, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y <- d_y)"))
        return false;

      for (int i = 0; i < n; i++)
         printf(" %f\n", h_buf[i]);
      return true;
    }

    // Host driver: exercises the assignment, memcpy and cudaMemcpyAsync kernel
    // variants in turn on a source of 1.0f values and a destination reset to
    // 2.0f, printing the destination after each. Every printed value should be
    // 1.000000. Returns 0 on success, 1 if an allocation or CUDA call failed.
    int main(void)
    {
      const int N = 10;
      const size_t bytes = N*sizeof(float);

      float *x = (float*)malloc(bytes);
      float *y = (float*)malloc(bytes);
      float *d_x = NULL, *d_y = NULL;

      // `ok` short-circuits the remaining steps after the first failure while
      // still falling through to the cleanup at the bottom.
      bool ok = (x != NULL) && (y != NULL);
      if (!ok)
        fprintf(stderr, "host allocation failed\n");

      ok = ok && cudaOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
      ok = ok && cudaOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

      if (ok) {
        for (int i = 0; i < N; i++)
          x[i] = 1.0f;
      }
      ok = ok && cudaOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x <- x)");

      ok = ok && runTrial<0>("Assignment \n", d_x, d_y, y, N);
      ok = ok && runTrial<1>("\n Memcpy \n", d_x, d_y, y, N);
      ok = ok && runTrial<2>("\n cudaMemcpyAsync \n", d_x, d_y, y, N);

      // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
      cudaFree(d_x);
      cudaFree(d_y);
      free(x);
      free(y);

      return ok ? 0 : 1;
    }
    

    will work as you expect:

    $ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
    $ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
    $ ./memcpy_kernel 
    Assignment 
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
    
     Memcpy 
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
    
     cudaMemcpyAsync 
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000
     1.000000