Here is a minimal program illustrating the problem. I'm using a GTS 250 with 128 CUDA cores, CUDA 5.0 on Windows 7.
void cuda_ops_test(int N, float* R)
{
//Values of input matrix in CPU
fprintf(stderr, "\nValues of R: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
//Initialize CUDA/CUBLAS
cublasHandle_t handle;
cublasStatus_t status;
status = cublasCreate(&handle);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "CUBLAS initialization succeeded.\n");
//Allocate device memory
float *dR = 0;
cudaError_t alloc_status;
alloc_status = cudaMalloc((void**)&dR, N*sizeof(dR[0]));
if(alloc_status == cudaSuccess)
fprintf (stderr, "\nDevice memory allocation succeeded.\n");
//Load array into memory
status = cublasSetMatrix(1, N, sizeof(R[0]), R, N, dR, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice write succeeded.\n");
//First operation: summation
float ans;
status = cublasSasum(handle, N, dR, 1, &ans);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "\ncublasSasum produced no error. Sum of dR: %d\n", ans);
else
fprintf(stderr, "\ncublasSasum error: %d.\n", status);
//Second operation: y = ax+y
const float alpha = 2.0;
status = cublasSaxpy(handle, N,
&alpha,
dR, 1,
dR, 1);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "\ncublasSaxpy produced no error.\n");
else
fprintf(stderr, "\ncublasSaxpy error: %d.\n", status);
// transfer device dR to host R
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, N, R, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice read succeded\n");
//Display post-op values of R
fprintf(stderr, "\nValues of R, after cublasSaxpy: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
//Attempt to zero with cudaMemset
cudaError_t stat = cudaMemset(dR, 0, N*sizeof(dR[0]));
if (stat==cudaSuccess)
fprintf(stderr, "\nZeroing with cudaMemset on R produced no error.\n");
//Again transfer device dR to host R, after zeroing
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, N, R, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice read succeded.\n");
//Display values of R again
fprintf(stderr, "\nValues of R, after zeroing with cudaMemset: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
cudaFree(dR);
}
Here's the output, indicating that while the data were loaded into GPU memory, no operations actually occurred:
Values of R: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
CUBLAS initialization succeeded.
Device memory allocation succeeded.
cublasSasum produced no error. Sum of dR: 0
cublasSaxpy produced no error.
Values of R, after cublasSaxpy: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
Zeroing with cudaMemset on R produced no error.
Values of R, after zeroing with cudaMemset: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
What's going on? (And happy holidays. :) )
Your code had several errors. As I mention in the comments, you missed the fact that both your device read and device write messages were not being printed out because those functions (cublasSetMatrix, cublasGetMatrix) were in fact failing.
To fix the cublasSetMatrix and cublasGetMatrix calls, change the lda and ldb parameters to 1:
status = cublasSetMatrix(1, N, sizeof(R[0]), R, 1, dR, 1);
...
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, 1, R, 1);
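The documentation for these functions says: "with the leading dimension of the source matrix A and destination matrix B given in lda and ldb, respectively. The leading dimension indicates the number of rows of the allocated matrix". Your matrix has 1 row and N columns, so both leading dimensions must be 1; with a leading dimension of N, consecutive values would be taken N elements apart, which runs past the end of the N-element buffers.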
In your line printing out the result of the cublasSasum operation, your printf statement is incorrectly using an int format specifier to print out a float value. This won't work. Change the %d to %f:
fprintf(stderr, "\ncublasSasum produced no error. Sum of dR: %f\n", ans);
With those changes, I was able to get a sensible result:
Values of R:
0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
CUBLAS initialization succeeded.
Device memory allocation succeeded.
Device write succeeded.
cublasSasum produced no error. Sum of dR: 3.071503
cublasSaxpy produced no error.
Device read succeded
Values of R, after cublasSaxpy:
0.369060, 1.103427, 2.504043, 0.105288, 1.551042, 1.988952, 1.278663, 0.314034,
Zeroing with cudaMemset on R produced no error.
Device read succeded.
Values of R, after zeroing with cudaMemset:
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
Note that this SO question/answer provides a tip for a useful, convenient cublas error parser function. It's not difficult to build this into a wrapper or error check macro for your cublas function calls.