I am trying to copy from a source float array (containing 1.0f) to a destination float array (containing 2.0f) inside a CUDA kernel. I tried three different ways: memcpy, cudaMemcpyAsync, and a direct element assignment (dst[idx] = src[idx]).
When I read the results after the kernel has executed, I find that both cudaMemcpyAsync and memcpy have failed to copy, while the direct copy method has worked.
Why have the cudaMemcpyAsync and memcpy methods failed?
I am using a GTX Titan X (SM_52).
compiled using: nvcc -arch=compute_52 main.cu
main.cu:
#include <stdio.h>
#include <iostream>
// Kernel skeleton from the question: each thread computes a flat 1D index and,
// when that index is below `size`, is meant to copy one float from `src` to
// `dst`. All three candidate copy statements are commented out, so as written
// the kernel leaves `dst` untouched.
//
// src  - source array, indexed by the flat thread index
// dst  - destination array, indexed by the flat thread index
// size - number of elements; threads with idx >= size do nothing
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
// Flat global thread index for a 1D grid of 1D blocks.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < size){
// Why the first two variants below do not copy element idx:
//  * `src` and `dst` are `float *`, so pointer arithmetic is already scaled by
//    sizeof(float); `dst + idx*sizeof(float)` addresses element
//    idx*sizeof(float), not element idx.
//  * The last argument is a byte count; 1 copies a single byte, not one float.
// The intended forms are `dst + idx`, `src + idx` and `sizeof(float)`.
// NOTE(review): calling cudaMemcpyAsync from device code presumably also needs
// relocatable device code (nvcc -dc / -rdc=true) -- confirm against the build.
// memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
// cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
// dst[idx] = src[idx]; // COPIES SUCCESSFULLY
}
}
// Running counter backing UniqueNumber(); starts at zero.
int current = 0;

// Advances the global counter by one and returns the new value, so successive
// calls yield 1, 2, 3, ...
int UniqueNumber()
{
    current += 1;
    return current;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills x with 1.0f and y with 2.0f, uploads both, runs the kernel
// with (src = d_x, dst = d_y), downloads d_y into y and prints every element.
// Returns 0 on success and 1 if any allocation or CUDA call fails; all host and
// device buffers are released on every path.
int main(void)
{
    const int N = 1000;
    const size_t bytes = N * sizeof(float);
    // Ceil-div so the grid always covers N elements (2 blocks of 512 for N = 1000).
    const int threads = 512;
    const int blocks = (N + threads - 1) / threads;

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    float *d_x = NULL;
    float *d_y = NULL;

    bool ok = (x != NULL) && (y != NULL);
    if (!ok)
        fprintf(stderr, "host allocation failed\n");

    ok = ok && checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
    ok = ok && checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

    if (ok) {
        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }
        ok = checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x <- x)")
          && checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y <- y)");
    }

    if (ok) {
        // cudamemcpy_inside_kernel<<<(N+255)/256, 256>>>(d_x, d_y, N);
        cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
        // A launch returns no status: bad configurations show up in
        // cudaGetLastError(), in-kernel faults at the next synchronizing call.
        ok = checkCuda(cudaGetLastError(), "kernel launch")
          && checkCuda(cudaDeviceSynchronize(), "kernel execution")
          && checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y <- d_y)");
    }

    if (ok) {
        for (int i = 0; i < N; i++)
            printf(" %f\n", y[i]); // y[i] should have all 1.0f
    }

    // cudaFree and free both accept null pointers, so this is safe after a
    // partial setup failure.
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
    return ok ? 0 : 1;
}
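You have the source, destination, and size arguments wrong in both memcpy calls. `src` and `dst` are `float *`, so pointer arithmetic on them is already scaled by `sizeof(float)`: `dst + idx*sizeof(float)` points at element `idx*sizeof(float)`, not element `idx`. The size argument is a byte count, so `1` copies a single byte rather than one float. Something like this: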
#include <stdio.h>
#include <iostream>
// Copies one float per thread from `src` to `dst`, using the mechanism chosen
// at compile time by `action`:
//   1        - device-side memcpy of sizeof(float) bytes
//   2        - device-side cudaMemcpyAsync (device-to-device) of sizeof(float) bytes
//   other    - plain element assignment
// Expects a 1D grid of 1D blocks; threads whose flat index is >= size do nothing.
template<int action>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= size)
        return;

    // Element addresses: arithmetic on float* already advances by whole floats.
    float *to = dst + gid;
    float *from = src + gid;

    if (action == 1) {
        memcpy(to, from, sizeof(float));
    } else if (action == 2) {
        cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
    } else {
        *to = *from;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Runs one copy variant of cudamemcpy_inside_kernel and prints the result.
//
// label  - heading printed before the run
// d_src  - device source array (n floats)
// d_dst  - device destination array (n floats)
// h_buf  - host scratch buffer (n floats); overwritten
// n      - element count
//
// The destination is re-seeded with 2.0f before every launch. Without this,
// h_buf would still hold the previous variant's downloaded result, and a
// variant that copied nothing would still print the expected values.
// Returns false if any CUDA call or the kernel itself fails.
template<int action>
static bool runCopyVariant(const char *label, float *d_src, float *d_dst, float *h_buf, int n)
{
    const size_t bytes = n * sizeof(float);
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;

    printf("%s", label);

    for (int i = 0; i < n; i++)
        h_buf[i] = 2.0f;
    if (!cudaOk(cudaMemcpy(d_dst, h_buf, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y <- y)"))
        return false;

    cudamemcpy_inside_kernel<action><<<blocks, threads>>>(d_src, d_dst, n);
    // A launch returns no status: bad configurations show up in
    // cudaGetLastError(), in-kernel faults at the next synchronizing call.
    if (!cudaOk(cudaGetLastError(), "kernel launch"))
        return false;
    if (!cudaOk(cudaDeviceSynchronize(), "kernel execution"))
        return false;

    if (!cudaOk(cudaMemcpy(h_buf, d_dst, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y <- d_y)"))
        return false;
    for (int i = 0; i < n; i++)
        printf(" %f\n", h_buf[i]);
    return true;
}

// Host driver: uploads a source array of 1.0f, then exercises the three copy
// variants (assignment, memcpy, cudaMemcpyAsync) in turn, each against a
// destination freshly reset to 2.0f. Returns 0 on success and 1 on any failure;
// all buffers are released on every path.
int main(void)
{
    const int N = 10;
    const size_t bytes = N * sizeof(float);

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    float *d_x = NULL;
    float *d_y = NULL;

    bool ok = (x != NULL) && (y != NULL);
    if (!ok)
        fprintf(stderr, "host allocation failed\n");

    ok = ok && cudaOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
    ok = ok && cudaOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

    if (ok) {
        for (int i = 0; i < N; i++)
            x[i] = 1.0f;
        ok = cudaOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x <- x)");
    }

    ok = ok && runCopyVariant<0>("Assignment \n", d_x, d_y, y, N);
    ok = ok && runCopyVariant<1>("\n Memcpy \n", d_x, d_y, y, N);
    ok = ok && runCopyVariant<2>("\n cudaMemcpyAsync \n", d_x, d_y, y, N);

    // cudaFree and free both accept null pointers, so this is safe after a
    // partial setup failure.
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
    return ok ? 0 : 1;
}
will work as you expect:
$ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
$ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
$ ./memcpy_kernel
Assignment
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
Memcpy
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
cudaMemcpyAsync
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000