cudaMemcpyAsync and memcpy fail to copy inside a kernel while direct copying works

I am trying to copy from a source float array(containing 1.0f) to a destination float array(containing 2.0f) inside a cuda kernel. I try three different ways using:

cudaMemcpyAsync
memcpy
direct copy (dst[i] = src[i])

When I read the results after the kernel has executed, I find that both cudaMemcpyAsync and memcpy have failed to copy, while the direct copy method has worked.

Why have the cudaMemcpyAsync and memcpy methods failed?

I am using GTX TitanX(SM_52).

compiled using: nvcc -arch=compute_52 main.cu

main.cu:

#include <stdio.h>
#include <iostream>


// Repro kernel: each thread with a global index below `size` is meant to copy
// one float from `src` to `dst`. The three alternative copy statements are
// left commented out exactly as they were tried; with all of them disabled the
// kernel writes nothing.
//
//   src  - input array of floats
//   dst  - output array of floats
//   size - number of elements to copy
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  // Flat global thread index for a 1D grid of 1D blocks.
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard: the launch may contain more threads than there are elements.
    if(idx < size){
// NOTE(review): in the two calls below, `dst` and `src` are float*, so adding
// idx*sizeof(float) advances by idx*sizeof(float) *elements* (pointer
// arithmetic is already scaled by the element size), and the third argument
// is a byte count, so `1` requests a single byte rather than one float.
//        memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
//        cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
// Plain element-wise assignment indexes by element and copies a whole float.
//          dst[idx] = src[idx]; // COPIES SUCCESSFULLY
    }

}

// Running counter backing UniqueNumber(); starts at zero.
int current = 0;

// Advances the counter by one and returns the new value (1 on the first call).
int UniqueNumber ()
{
  current += 1;
  return current;
}

// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills a source array with 1.0f and a destination array with
// 2.0f, runs the copy kernel over them, and prints the destination so it is
// visible whether the kernel overwrote the 2.0f values.
// Returns 0 on success, 1 if host allocation fails; CUDA failures terminate
// the process via checkCuda().
int main(void)
{
  const int N = 1000;
  const size_t bytes = N * sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(x);
    free(y);
    return 1;
  }

  float *d_x = NULL;
  float *d_y = NULL;
  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "upload of x");
  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "upload of y");

  // Enough 512-thread blocks to cover N (2 blocks for N = 1000); the kernel's
  // own bounds check handles the unused tail threads.
  const int threads = 512;
  const int blocks = (N + threads - 1) / threads;
  cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
  // A launch reports configuration errors only through cudaGetLastError();
  // faults raised while the kernel runs surface at the next synchronization.
  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "kernel execution");

  // Blocking copy: no further synchronization is needed before reading y.
  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "download of y");

  for (int i = 0; i < N; i++)
     printf(" %f\n", y[i]); // y[i] should have all 1.0f

  checkCuda(cudaFree(d_x), "cudaFree(d_x)");
  checkCuda(cudaFree(d_y), "cudaFree(d_y)");
  free(x);
  free(y);
  return 0;
}

Solution

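You have the source, destination, and size arguments wrong in both memcpy calls: `src` and `dst` are `float*`, so pointer arithmetic on them is already scaled by `sizeof(float)` and the offset should be `idx`, not `idx*sizeof(float)`; and the size argument is a byte count, so copying one float needs `sizeof(float)`, not `1`. Something like this: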

#include <stdio.h>
#include <iostream>

// Copies one float per thread from `src` to `dst`, using the mechanism chosen
// at compile time by `action`:
//   1          -> device-side memcpy of sizeof(float) bytes
//   2          -> device-side cudaMemcpyAsync of sizeof(float) bytes
//   any other  -> plain element assignment
// Threads whose global index is not below `size` do nothing.
// Note: variant 2 calls the CUDA runtime from device code, which needs
// relocatable device code (see the -dc compile step in the build commands).
template<int action>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  const int tid = threadIdx.x + blockDim.x*blockIdx.x;
  if (tid >= size)
    return;

  // Element-granular addresses: pointer arithmetic on float* already scales
  // by sizeof(float).
  float *from = src + tid;
  float *to = dst + tid;

  if (action == 1) {
    memcpy(to, from, sizeof(float));
  } else if (action == 2) {
    cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
  } else {
    *to = *from;
  }
}

// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Runs one copy variant of cudamemcpy_inside_kernel and prints the result.
//   banner - heading printed before the values
//   d_src  - device source array (n floats)
//   d_dst  - device destination array (n floats)
//   h_dst  - host scratch buffer (n floats) used for upload and download
//   n      - element count
template<int action>
static void runCopyVariant(const char *banner, float *d_src, float *d_dst,
                           float *h_dst, int n)
{
  const size_t bytes = (size_t)n * sizeof(float);
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;

  printf("%s", banner);

  // Refill the destination with the 2.0f sentinel before every variant.
  // Without this the buffer still holds the previous variant's 1.0f results,
  // and a kernel that copied nothing would appear to succeed.
  for (int i = 0; i < n; i++)
    h_dst[i] = 2.0f;
  checkCuda(cudaMemcpy(d_dst, h_dst, bytes, cudaMemcpyHostToDevice),
            "upload of destination sentinel");

  cudamemcpy_inside_kernel<action><<<blocks, threads>>>(d_src, d_dst, n);
  // Launch-configuration errors are only visible through cudaGetLastError();
  // faults raised while the kernel runs surface at the next synchronization.
  checkCuda(cudaGetLastError(), "kernel launch");
  checkCuda(cudaDeviceSynchronize(), "kernel execution");

  checkCuda(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost),
            "download of destination");
  for (int i = 0; i < n; i++)
     printf(" %f\n", h_dst[i]);
}

// Host driver: exercises the three copy variants (assignment, memcpy,
// cudaMemcpyAsync) against a source of 1.0f values and prints the destination
// after each one. Returns 0 on success, 1 if host allocation fails; CUDA
// failures terminate the process via checkCuda().
int main(void)
{
  const int N = 10;
  const size_t bytes = N * sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation failed\n");
    free(x);
    free(y);
    return 1;
  }

  float *d_x = NULL;
  float *d_y = NULL;
  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  for (int i = 0; i < N; i++)
    x[i] = 1.0f;
  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "upload of x");

  runCopyVariant<0>("Assignment \n", d_x, d_y, y, N);
  runCopyVariant<1>("\n Memcpy \n", d_x, d_y, y, N);
  runCopyVariant<2>("\n cudaMemcpyAsync \n", d_x, d_y, y, N);

  checkCuda(cudaFree(d_x), "cudaFree(d_x)");
  checkCuda(cudaFree(d_y), "cudaFree(d_y)");
  free(x);
  free(y);
  return 0;
}

will work as you expect:

$ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
$ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
$ ./memcpy_kernel 
Assignment 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 Memcpy 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 cudaMemcpyAsync 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000