Search code examples
cuda, cublas

Is there a cuda function to copy a row from a Matrix in column major?


I have a device matrix U of dimensions MxN in column major ordering. Now I'd like to extract the row K into a vector u. Is there a function to accomplish this? Note the copy would need to take into account an offset of K and a stride of M.

I looked at the function cudaMemcpy2D, but it does not ring any bells. Coming from a more LAPACK-style API, I don't understand what the pitch parameters are. Why are they not simply called rows and cols, or M and N?


Solution

  • You can use

    cublas<t>copy(handle, N, U+K, M, u, 1);
    

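    as in the following complete example, which copies N elements starting at offset K with a stride of M: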

    #include<stdio.h>
    #include<stdlib.h>
    #include<conio.h>
    #include<assert.h>
    
    #include<cublas_v2.h>
    
    /***********************/
    /* CUDA ERROR CHECKING */
    /***********************/
    /*
     * gpuErrchk(ans): evaluates a CUDA runtime call and hands its status to
     * gpuAssert together with the file and line of the call site.
     * Wrapped in do { } while (0) so the macro behaves as a single statement,
     * including inside an unbraced if/else.
     */
    #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

    /*
     * Reports a failed CUDA runtime call on stderr.
     *
     *   code  - status returned by the CUDA runtime call
     *   file  - source file of the call site (normally __FILE__)
     *   line  - source line of the call site (normally __LINE__)
     *   abort - when true (the default), exit the process with `code` as the
     *           exit status after printing the message
     *
     * Does nothing when code == cudaSuccess.
     */
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        // `file` is const because __FILE__ is a string literal; binding it to a
        // non-const char* is ill-formed in C++11 and later.
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    /*************************/
    /* cuBLAS ERROR CHECKING */
    /*************************/
    /*
     * cublasSafeCall(err): passes a cuBLAS status to __cublasSafeCall together
     * with the file and line of the call site.
     */
    #ifndef cublasSafeCall
    #define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
    #endif
    
    /*
     * Reports a failed cuBLAS call on stderr and terminates the program.
     *
     *   err  - status returned by the cuBLAS call
     *   file - source file of the call site (normally __FILE__)
     *   line - source line of the call site (normally __LINE__)
     *
     * Does nothing when err == CUBLAS_STATUS_SUCCESS.
     */
    inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
    {
        if( CUBLAS_STATUS_SUCCESS != err) {
            // Print the caller's location (file/line parameters), not the
            // location of this helper, so the message points at the failing call.
            fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n", file, line, (int)err);
            // Wait for a key press so the message stays visible, then reset the device.
            getch(); cudaDeviceReset(); assert(0);
            // assert(0) is compiled out when NDEBUG is defined; exit explicitly
            // so the function never returns after reporting a failure.
            exit(EXIT_FAILURE);
        }
    }
    
    /*
     * Demo: extract row K of an M x N column-major matrix on the device.
     *
     * In column-major storage element (i, j) lives at index j*M + i, so row K
     * starts at offset K and its consecutive elements are M apart. cublasScopy
     * is therefore called with source d_U + K, source stride M, N elements,
     * and a contiguous (stride 1) destination.
     */
    int main() {
    
        const int M = 5;   // number of rows
        const int N = 4;   // number of columns
        const int K = 2;   // index of the row to extract
    
        cublasHandle_t handle;
        cublasSafeCall(cublasCreate(&handle));
    
        // Host and device copies of the full matrix (M*N elements).
        float* U = (float*)malloc(M*N*sizeof(float));
        float* d_U; 
        gpuErrchk(cudaMalloc((void**)&d_U,M*N*sizeof(float)));
    
        // Host and device copies of the extracted row. A row has N elements,
        // so both buffers are sized by N (not M).
        float* u = (float*)malloc(N*sizeof(float));
        float* d_u;
        gpuErrchk(cudaMalloc((void**)&d_u,N*sizeof(float)));
    
        if (U == NULL || u == NULL) {
            fprintf(stderr, "Host memory allocation failed\n");
            free(U);
            free(u);
            cudaFree(d_U);
            cudaFree(d_u);
            cublasDestroy(handle);
            return EXIT_FAILURE;
        }
    
        for (int j=0; j<N; j++) 
            for (int i=0; i<M; i++) 
                U[j*M+i] = (float)(i*j); // Column-major ordering
    
        printf("K-th row - Input\n");
        for (int j=0; j<N; j++) printf("U(K,%i) =  %f\n",j,U[j*M+K]);
        printf("\n\n");
    
        gpuErrchk(cudaMemcpy(d_U,U,M*N*sizeof(float),cudaMemcpyHostToDevice));
    
        // Strided device-to-device copy: N elements, starting at d_U+K, stride M.
        cublasSafeCall(cublasScopy(handle, N, d_U+K, M, d_u, 1));
    
        gpuErrchk(cudaMemcpy(u,d_u,N*sizeof(float),cudaMemcpyDeviceToHost));
    
        printf("K-th row - Output\n");
        for (int j=0; j<N; j++) printf("u(%i) =  %f\n",j,u[j]);
    
        // Release everything acquired above.
        gpuErrchk(cudaFree(d_u));
        gpuErrchk(cudaFree(d_U));
        free(u);
        free(U);
        cublasSafeCall(cublasDestroy(handle));
    
        // Keep the console window open until the user presses Enter.
        getchar();
    
        return 0;
    }