I'm trying to calculate the largest eigenvalue/eigenvector pair with cuSolver, released in CUDA 7.0 RC. The problem is that I'm getting CUSOLVER_INTERNAL_ERROR, and I don't know what to do about it.
These are my helper macros, used to check the return status of CUDA/cuSPARSE/cuSOLVER calls.
// my handy stuff
#define CUDA_CALL(value) do { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(-1); \
} \
} while(0)
#define CUSPARSE_CALL(value) do { \
cusparseStatus_t _m_status = value; \
if (_m_status != CUSPARSE_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\n", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)
#define CUSOLVER_CALL(value) do { \
cusolverStatus_t _m_status = value; \
if (_m_status != CUSOLVER_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\n", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)
And this is my code:
#include "cusparse.h"
#include "cusolverSp.h"
#include <cuda_runtime.h>
#include <math.h>
void dpss( size_t N, double NW, double *eigenvector );
int main()
{
// parameters for generation of dpss
size_t N = 128;
double NW = 1;
double *eigenvector = new double[ N ]; // new[] takes an element count, not a byte count
dpss( N, NW, eigenvector );
delete[] eigenvector;
return 0;
}
void dpss( size_t N, double NW, double *eigenvector )
{
// define matrix T (NxN)
double** T = new double*[ N ];
for(int i = 0; i < N; ++i)
T[ i ] = new double[ N ];
// fill in T as function of ( N, W )
// T is a tridiagonal matrix, i.e. it has a diagonal, a subdiagonal and a superdiagonal;
// all other elements are 0
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if( j == i - 1 ) // subdiagonal
T[ i ][ j ] = ( (double)N - i )*i/2;
else if( j == i ) // diagonal
T[ i ][ j ] = pow( (double)(N-1)/2 - i, 2 )*std::cos( 2*NW/(double)N*M_PI );
else if( j == i + 1 ) // superdiagonal
T[ i ][ j ] = ( i + 1 )*( (double)N - 1 - i )/2;
else // all other elements
T[ i ][ j ] = 0;
}
}
// declarations needed
cusolverStatus_t statCusolver = CUSOLVER_STATUS_SUCCESS;
cusolverSpHandle_t handleCusolver = NULL;
cusparseHandle_t handleCusparse = NULL;
cusparseMatDescr_t descrA = NULL;
int *h_cooRowIndex = NULL, *h_cooColIndex = NULL;
double *h_cooVal = NULL;
int *d_cooRowIndex = NULL, *d_cooColIndex = NULL, *d_csrRowPtr = NULL;
double *d_cooVal = NULL;
int nnz;
double *h_eigenvector0 = NULL, *d_eigenvector0 = NULL, *d_eigenvector = NULL;
double max_lambda;
// the eigenvalues of T lie in the interval [-max_lambda, max_lambda];
// max_lambda also serves below as the initial eigenvalue guess mu0
max_lambda = ( N - 1 )*( N + 2 ) + N*( N + 1 )/8 + 0.25;
// amount of nonzero elements of T
nnz = 3*N - 2;
// allocate host memory
h_cooRowIndex = new int[ nnz ]; // element counts, not byte counts
h_cooColIndex = new int[ nnz ];
h_cooVal = new double[ nnz ];
h_eigenvector0 = new double[ N ];
// fill in vectors that describe T as a sparse matrix
int counter = 0;
for (int i = 0; i < N; i++ ) {
for( int j = 0; j < N; j++ ) {
if( T[ i ][ j ] != 0 ) {
h_cooColIndex[counter] = j;
h_cooRowIndex[counter] = i;
h_cooVal[counter++] = T[ i ][ j ];
}
}
}
// fill in initial eigenvector guess
for( int i = 0; i < N; i++ )
h_eigenvector0[ i ] = (double)1/(i+1);
// allocate device memory
CUDA_CALL( cudaMalloc((void**)&d_cooRowIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooColIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooVal, nnz*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_csrRowPtr, (N+1)*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector0, N*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector, N*sizeof( double )) );
// copy data to device
CUDA_CALL( cudaMemcpy( d_cooRowIndex, h_cooRowIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooColIndex, h_cooColIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooVal, h_cooVal, (size_t)(nnz*sizeof( double )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_eigenvector0, h_eigenvector0, (size_t)(N*sizeof( double )), cudaMemcpyHostToDevice ) );
// initialize cusparse and cusolver
CUSOLVER_CALL( cusolverSpCreate( &handleCusolver ) );
CUSPARSE_CALL( cusparseCreate( &handleCusparse ) );
// create and define cusparse matrix descriptor
CUSPARSE_CALL( cusparseCreateMatDescr(&descrA) );
CUSPARSE_CALL( cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL ) );
CUSPARSE_CALL( cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO ) );
// transform from coordinates (COO) values to compressed row pointers (CSR) values
CUSPARSE_CALL( cusparseXcoo2csr( handleCusparse, d_cooRowIndex, nnz, N, d_csrRowPtr, CUSPARSE_INDEX_BASE_ZERO ) );
// define some parameters and call cusolverSpDcsreigvsi (the double-precision variant)
int maxite = 1000000;
double tol = 1;
double mu = 0;
statCusolver = cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, &mu, d_eigenvector );
// here statCusolver = CUSOLVER_INTERNAL_ERROR
CUDA_CALL( cudaDeviceSynchronize() );
CUDA_CALL( cudaGetLastError() );
// copy the computed eigenvector from device to host, into the output parameter
CUDA_CALL( cudaMemcpy( eigenvector, d_eigenvector, (size_t)(N*sizeof( double )), cudaMemcpyDeviceToHost ) );
// destroy and free stuff
CUSPARSE_CALL( cusparseDestroyMatDescr( descrA ) );
CUSPARSE_CALL( cusparseDestroy( handleCusparse ) );
CUSOLVER_CALL( cusolverSpDestroy( handleCusolver ) );
CUDA_CALL( cudaFree( d_cooRowIndex ) );
CUDA_CALL( cudaFree( d_cooColIndex ) );
CUDA_CALL( cudaFree( d_cooVal ) );
CUDA_CALL( cudaFree( d_csrRowPtr ) );
CUDA_CALL( cudaFree( d_eigenvector0 ) );
CUDA_CALL( cudaFree( d_eigenvector ) );
delete[] h_eigenvector0;
delete[] h_cooRowIndex;
delete[] h_cooColIndex;
delete[] h_cooVal;
for( int i = 0; i < N; ++i ) // also release the dense copy of T
delete[] T[ i ];
delete[] T;
}
I already tried different choices for the initial eigenvalue guess (namely max_lambda, or mu0 as it's called in the cuSolver tutorial), the initial eigenvector guess (h_eigenvector0 / d_eigenvector0), the tolerance (tol), and even the maximum number of iterations (maxite).
I already checked that the sparse matrix was built properly (it looked correct to me; the kind of host-side check I ran is sketched below). I also compared the returned eigenvector against Matlab, and the two are completely different (and I don't think they should be).
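For reference, the check I mean is along these lines (a minimal sketch; the helper name coo_looks_tridiagonal is just illustrative). It verifies that every COO triplet lies on one of the three diagonals, that indices stay in range, and that the entry count matches nnz = 3*N - 2:
// illustrative host-side sanity check of the COO triplets
bool coo_looks_tridiagonal( int nnz, size_t N, const int *rowInd, const int *colInd )
{
for( int k = 0; k < nnz; k++ ) {
int d = colInd[ k ] - rowInd[ k ];
if( d < -1 || d > 1 ) // entry off the three diagonals
return false;
if( rowInd[ k ] < 0 || rowInd[ k ] >= (int)N ) // row index out of range
return false;
}
return nnz == (int)( 3*N - 2 ); // expected entry count for a tridiagonal matrix
}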
I don't know what else I can do, but if someone does, please let me know!
Thanks in advance.
It appears (to me) that the cuSolver documentation may be incorrect with respect to the mu parameter. The documentation appears to indicate that it is in host memory space, i.e. that the second-to-last parameter should be a host pointer.
If I change it to be a device pointer:
double mu = 0;
double *d_mu;
CUDA_CALL(cudaMalloc(&d_mu, sizeof(double)));
CUDA_CALL(cudaMemset(d_mu, 0, sizeof(double)));
CUSOLVER_CALL(cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, d_mu, d_eigenvector ));
...
CUDA_CALL(cudaMemcpy(&mu, d_mu, sizeof(double), cudaMemcpyDeviceToHost));
I can get a version of your code to run without any API errors or cuda-memcheck errors. (I can't vouch for the results.)
I have already filed a documentation inquiry with NVIDIA. If you can confirm that you get sensible results with this change, that might also be useful. I took a look at the resulting eigenvalue and eigenvector, and although they did not appear to be bizarre, I could not get them to correlate exactly with my attempt to do the same thing in Octave.
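If you want a check that doesn't depend on Matlab/Octave at all: eigenvectors are only defined up to scale and sign, so an element-wise comparison can look "completely different" even when both answers are correct. A scale-independent test is to compute the residual r = T*v - mu*v on the host from the COO triplets you already have; for a genuine eigenpair the relative residual should be near machine precision. A minimal sketch (assuming the computed eigenvector has been copied back into a host array h_v of length N):
// host-side residual check: r = T*v - mu*v, built from the COO triplets
double *r = new double[ N ];
for( int i = 0; i < (int)N; i++ )
r[ i ] = -mu*h_v[ i ]; // start from -mu*v
for( int k = 0; k < nnz; k++ ) // accumulate T*v one triplet at a time
r[ h_cooRowIndex[ k ] ] += h_cooVal[ k ]*h_v[ h_cooColIndex[ k ] ];
double r2 = 0, v2 = 0;
for( int i = 0; i < (int)N; i++ ) {
r2 += r[ i ]*r[ i ];
v2 += h_v[ i ]*h_v[ i ];
}
// relative residual ||r|| / (|mu|*||v||): tiny for a true eigenpair
printf( "relative residual = %e\n", sqrt( r2/v2 )/fabs( mu ) );
delete[] r;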