I have a device matrix U of dimensions MxN in column-major ordering. I would like to extract row K into a vector u. Is there a function to accomplish this? Note that the copy would need to take into account an offset of K and a stride of M.
I was looking at the function cudaMemcpy2D, but it rings no bells. Coming from a more LAPACK-style API, I don't understand what these pitch parameters are; why are they not simply called rows and cols, or M and N?
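You can use
cublas<t>copy(handle, N, U+K, M, u, 1);
which copies N elements starting at U+K (the first element of row K), reading the source with a stride of M and writing the destination u with a stride of 1. A complete example follows: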
#include <assert.h>
#include <conio.h>
#include <stdio.h>
#include <stdlib.h>

#include <cublas_v2.h>
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// Wraps a CUDA runtime call and forwards its status, together with the
// call site's file name and line number, to gpuAssert().
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Reports a failed CUDA runtime call.
 *
 * If `code` is not cudaSuccess, prints the CUDA error string together with
 * the supplied source location to stderr and, when `abort` is true, exits
 * the process using the error code as the exit status.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (the macro passes __FILE__)
 * @param line  source line of the call site (the macro passes __LINE__)
 * @param abort terminate the process on error when true (default)
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // `file` is const char* because the macro passes __FILE__, a string
    // literal, which must not be bound to a non-const char*.
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*************************/
/* cuBLAS ERROR CHECKING */
/*************************/
// Wraps a cuBLAS call and forwards its status, together with the call
// site's file name and line number, to __cublasSafeCall().
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif

/**
 * Reports a failed cuBLAS call and terminates the program.
 *
 * If `err` is not CUBLAS_STATUS_SUCCESS, prints the numeric status and the
 * call site's location to stderr, waits for a key press, resets the device
 * and terminates.
 *
 * @param err  status returned by the cuBLAS call
 * @param file source file of the call site (the macro passes __FILE__)
 * @param line source line of the call site (the macro passes __LINE__)
 */
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
    if (CUBLAS_STATUS_SUCCESS != err) {
        // Report the caller's location (file/line arguments); using
        // __FILE__/__LINE__ here would always point at this helper.
        fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n", file, line, (int)err);
        getch();            // wait for a key press (declared in <conio.h>)
        cudaDeviceReset();
        assert(0);
        // assert() is compiled out when NDEBUG is defined; make sure the
        // program still terminates in that configuration.
        exit(EXIT_FAILURE);
    }
}
/**
 * Demo: extract row K of an MxN column-major device matrix into a
 * contiguous device vector using cublasScopy, then print it on the host.
 *
 * In column-major storage element (i,j) lives at index j*M+i, so row K
 * starts at offset K and its consecutive elements are M floats apart.
 */
int main() {
    const int M = 5;    // number of rows
    const int N = 4;    // number of columns
    const int K = 2;    // index of the row to extract

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    // Host buffers: the full matrix and the extracted row.
    // A row has N entries (one per column), so `u` needs N floats.
    float* U = (float*)malloc(M*N*sizeof(float));
    float* u = (float*)malloc(N*sizeof(float));
    if (U == NULL || u == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(U);
        free(u);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    // Device buffers matching the host ones.
    float* d_U;
    gpuErrchk(cudaMalloc((void**)&d_U,M*N*sizeof(float)));
    float* d_u;
    gpuErrchk(cudaMalloc((void**)&d_u,N*sizeof(float)));

    for (int j=0; j<N; j++)
        for (int i=0; i<M; i++)
            U[j*M+i] = (float)(i*j); // Column-major ordering

    printf("K-th row - Input\n");
    for (int j=0; j<N; j++) printf("U(K,%i) = %f\n",j,U[j*M+K]);
    printf("\n\n");

    gpuErrchk(cudaMemcpy(d_U,U,M*N*sizeof(float),cudaMemcpyHostToDevice));

    // Copy N elements starting at d_U+K with source stride M into d_u
    // with destination stride 1.
    cublasSafeCall(cublasScopy(handle, N, d_U+K, M, d_u, 1));

    gpuErrchk(cudaMemcpy(u,d_u,N*sizeof(float),cudaMemcpyDeviceToHost));

    printf("K-th row - Output\n");
    for (int j=0; j<N; j++) printf("u(%i) = %f\n",j,u[j]);

    getchar();  // keep the console open until the user presses Enter

    // Release device memory, the cuBLAS handle and host memory.
    gpuErrchk(cudaFree(d_u));
    gpuErrchk(cudaFree(d_U));
    cublasSafeCall(cublasDestroy(handle));
    free(u);
    free(U);

    return 0;
}