I'm trying to calculate the largest eigenvalue/eigenvector pair with cuSolver, released in CUDA 7.0 RC. The problem is that I'm getting CUSOLVER_INTERNAL_ERROR, and I don't know what to do about it.
These are my helper macros, used to check the return status of CUDA/cuSPARSE/cuSOLVER calls.
// my handy stuff
#define CUDA_CALL(value) do { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(-1); \
} \
} while(0)
#define CUSPARSE_CALL(value) do { \
cusparseStatus_t _m_status = value; \
if (_m_status != CUSPARSE_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\n", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)
#define CUSOLVER_CALL(value) do { \
cusolverStatus_t _m_status = value; \
if (_m_status != CUSOLVER_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\n", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)
And this is my code:
#include "cusparse.h"
#include "cusolverSp.h"
#include <cuda_runtime.h>
#include <math.h>
void dpss( size_t N, double NW, double *eigenvector );
int main()
{
// parameters for generation of dpss
size_t N = 128;
double NW = 1;
double *eigenvector = new double[ N ]; // new[] takes an element count, not a byte count
dpss( N, NW, eigenvector );
delete[] eigenvector;
return 0;
}
void dpss( size_t N, double NW, double *eigenvector )
{
// define matrix T (NxN)
double** T = new double*[ N ];
for(int i = 0; i < N; ++i)
T[ i ] = new double[ N ];
// fill in T as function of ( N, W )
// T is a tridiagonal matrix, i.e. it has a diagonal, a subdiagonal and a superdiagonal;
// all other elements are 0
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if( j == i - 1 ) // subdiagonal
T[ i ][ j ] = ( (double)N - i )*i/2;
else if( j == i ) // diagonal
T[ i ][ j ] = pow( (double)(N-1)/2 - i, 2 )*std::cos( 2*NW/(double)N*M_PI );
else if( j == i + 1 ) // superdiagonal
T[ i ][ j ] = ( i + 1 )*( (double)N - 1 - i )/2;
else // all other elements
T[ i ][ j ] = 0;
}
}
// declarations needed
cusolverStatus_t statCusolver = CUSOLVER_STATUS_SUCCESS;
cusolverSpHandle_t handleCusolver = NULL;
cusparseHandle_t handleCusparse = NULL;
cusparseMatDescr_t descrA = NULL;
int *h_cooRowIndex = NULL, *h_cooColIndex = NULL;
double *h_cooVal = NULL;
int *d_cooRowIndex = NULL, *d_cooColIndex = NULL, *d_csrRowPtr = NULL;
double *d_cooVal = NULL;
int nnz;
double *h_eigenvector0 = NULL, *d_eigenvector0 = NULL, *d_eigenvector = NULL;
double max_lambda;
// the eigenvalues of T lie in the interval [-max_lambda, max_lambda];
// max_lambda also serves below as the initial eigenvalue guess mu0
max_lambda = ( N - 1 )*( N + 2 ) + N*( N + 1 )/8 + 0.25;
// amount of nonzero elements of T
nnz = 3*N - 2;
// allocate host memory
h_cooRowIndex = new int[ nnz ]; // element counts, not byte counts
h_cooColIndex = new int[ nnz ];
h_cooVal = new double[ nnz ];
h_eigenvector0 = new double[ N ];
// fill in vectors that describe T as a sparse matrix
int counter = 0;
for (int i = 0; i < N; i++ ) {
for( int j = 0; j < N; j++ ) {
if( T[ i ][ j ] != 0 ) {
h_cooColIndex[counter] = j;
h_cooRowIndex[counter] = i;
h_cooVal[counter++] = T[ i ][ j ];
}
}
}
// fill in initial eigenvector guess
for( int i = 0; i < N; i++ )
h_eigenvector0[ i ] = (double)1/(i+1);
// allocate device memory
CUDA_CALL( cudaMalloc((void**)&d_cooRowIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooColIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooVal, nnz*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_csrRowPtr, (N+1)*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector0, N*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector, N*sizeof( double )) );
// copy data to device
CUDA_CALL( cudaMemcpy( d_cooRowIndex, h_cooRowIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooColIndex, h_cooColIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooVal, h_cooVal, (size_t)(nnz*sizeof( double )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_eigenvector0, h_eigenvector0, (size_t)(N*sizeof( double )), cudaMemcpyHostToDevice ) );
// initialize cusparse and cusolver
CUSOLVER_CALL( cusolverSpCreate( &handleCusolver ) );
CUSPARSE_CALL( cusparseCreate( &handleCusparse ) );
// create and define cusparse matrix descriptor
CUSPARSE_CALL( cusparseCreateMatDescr(&descrA) );
CUSPARSE_CALL( cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL ) );
CUSPARSE_CALL( cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO ) );
// transform from coordinates (COO) values to compressed row pointers (CSR) values
CUSPARSE_CALL( cusparseXcoo2csr( handleCusparse, d_cooRowIndex, nnz, N, d_csrRowPtr, CUSPARSE_INDEX_BASE_ZERO ) );
// define some parameters and call cusolverSpDcsreigvsi (the double-precision variant)
int maxite = 1000000;
double tol = 1;
double mu = 0;
statCusolver = cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, &mu, d_eigenvector );
// here statCusolver = CUSOLVER_INTERNAL_ERROR
CUDA_CALL( cudaDeviceSynchronize() );
CUDA_CALL( cudaGetLastError() );
// copy the computed eigenvector from device to host, into the output parameter
CUDA_CALL( cudaMemcpy( eigenvector, d_eigenvector, (size_t)(N*sizeof( double )), cudaMemcpyDeviceToHost ) );
// destroy and free stuff
CUSPARSE_CALL( cusparseDestroyMatDescr( descrA ) );
CUSPARSE_CALL( cusparseDestroy( handleCusparse ) );
CUSOLVER_CALL( cusolverSpDestroy( handleCusolver ) );
CUDA_CALL( cudaFree( d_cooRowIndex ) );
CUDA_CALL( cudaFree( d_cooColIndex ) );
CUDA_CALL( cudaFree( d_cooVal ) );
CUDA_CALL( cudaFree( d_csrRowPtr ) );
CUDA_CALL( cudaFree( d_eigenvector0 ) );
CUDA_CALL( cudaFree( d_eigenvector ) );
delete[] h_eigenvector0;
delete[] h_cooRowIndex;
delete[] h_cooColIndex;
delete[] h_cooVal;
for( int i = 0; i < N; ++i ) // also release the dense copy of T
delete[] T[ i ];
delete[] T;
}
I already tried different choices for the initial eigenvalue guess (namely max_lambda, or mu0 as it's called in the cuSolver tutorial), the initial eigenvector guess (h_eigenvector0 / d_eigenvector0), the tolerance (tol), and even the maximum number of iterations (maxite).
I already checked that the sparse matrix was built properly (it looked correct to me; the kind of host-side check I ran is sketched below). I also compared the returned eigenvector against Matlab, and the two are completely different (and I don't think they should be).
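For reference, the check I mean is along these lines (a minimal sketch; the helper name coo_looks_tridiagonal is just illustrative). It verifies that every COO triplet lies on one of the three diagonals, that indices stay in range, and that the entry count matches nnz = 3*N - 2:
// illustrative host-side sanity check of the COO triplets
bool coo_looks_tridiagonal( int nnz, size_t N, const int *rowInd, const int *colInd )
{
for( int k = 0; k < nnz; k++ ) {
int d = colInd[ k ] - rowInd[ k ];
if( d < -1 || d > 1 ) // entry off the three diagonals
return false;
if( rowInd[ k ] < 0 || rowInd[ k ] >= (int)N ) // row index out of range
return false;
}
return nnz == (int)( 3*N - 2 ); // expected entry count for a tridiagonal matrix
}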
I don't know what else I can do, but if someone does, please let me know!
Thanks in advance.
It appears (to me) that the cuSolver documentation may be incorrect with respect to the mu parameter. The documentation appears to indicate that it is in host memory space, i.e. that the second-to-last parameter should be a host pointer.
If I change it to be a device pointer:
double mu = 0;
double *d_mu;
CUDA_CALL(cudaMalloc(&d_mu, sizeof(double)));
CUDA_CALL(cudaMemset(d_mu, 0, sizeof(double)));
CUSOLVER_CALL(cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, d_mu, d_eigenvector ));
...
CUDA_CALL(cudaMemcpy(&mu, d_mu, sizeof(double), cudaMemcpyDeviceToHost));
I can get a version of your code to run without any API errors or cuda-memcheck errors. (I can't vouch for the results.)
I have already filed a documentation inquiry with NVIDIA. If you can confirm that you get sensible results with this change, that might also be useful. I took a look at the resulting eigenvalue and eigenvector, and although they did not appear to be bizarre, I could not get them to correlate exactly with my attempt to do the same thing in Octave.
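If you want a check that doesn't depend on Matlab/Octave at all: eigenvectors are only defined up to scale and sign, so an element-wise comparison can look "completely different" even when both answers are correct. A scale-independent test is to compute the residual r = T*v - mu*v on the host from the COO triplets you already have; for a genuine eigenpair the relative residual should be near machine precision. A minimal sketch (assuming the computed eigenvector has been copied back into a host array h_v of length N):
// host-side residual check: r = T*v - mu*v, built from the COO triplets
double *r = new double[ N ];
for( int i = 0; i < (int)N; i++ )
r[ i ] = -mu*h_v[ i ]; // start from -mu*v
for( int k = 0; k < nnz; k++ ) // accumulate T*v one triplet at a time
r[ h_cooRowIndex[ k ] ] += h_cooVal[ k ]*h_v[ h_cooColIndex[ k ] ];
double r2 = 0, v2 = 0;
for( int i = 0; i < (int)N; i++ ) {
r2 += r[ i ]*r[ i ];
v2 += h_v[ i ]*h_v[ i ];
}
// relative residual ||r|| / (|mu|*||v||): tiny for a true eigenpair
printf( "relative residual = %e\n", sqrt( r2/v2 )/fabs( mu ) );
delete[] r;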