cuda-memcheck error in cusolverDnCgesvdjBatched function using CUDA

I am using cusolverDnCgesvdjBatched function to calculate singular value decomposition (SVD) of multiple matrices, I use cuda-memcheck to check any memory issues, I am getting an error like this in the cusolverDnCgesvdjBatched function.

========= Invalid __global__ write of size 4
=========     at 0x000062f8 in void batched_svd_parallel_jacobi_32x16<float2, float>(int, int, int, int, float2*, __int64, int, float*, float2*, __int64, int, float2*, __int64, int, float, int, int*, float, int, int*, int, float)
=========     by thread (0,0,0) in block (4,0,0)
=========     Address 0x701019010 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host 
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
=========     Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
=========     Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= ERROR SUMMARY: 8 errors

I am attaching the whole code I am using.

kernel.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define FULLSVD
#define PRINTRESULTS

/********/
/* MAIN */
/********/
int main() {

    const int           M = 10;
    const int           N = 5;
    const int           lda = M;
    //const int         numMatrices = 3;
    const int           numMatrices = 256;

    TimingGPU timerGPU;

    // --- Setting the host matrix
    cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(double));
    for (unsigned int k = 0; k < numMatrices; k++)
        for (unsigned int i = 0; i < M; i++)
        {
            for (unsigned int j = 0; j < N; j++)
            {
                h_A[k * M * N + j * M + i] = make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
                //printf("[%d, %d] %f\n", i, j, h_A[j*M + i]);
                //printf("%f %f", h_A[j*M + i].x, h_A[j * M + i].y);
            }
            //printf("\n");
        }

    // --- Setting the device matrix and moving the host matrix to the device
    cuComplex *d_A;         gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
    cuComplex *h_U = NULL;
    cuComplex *h_V = NULL;
#ifdef FULLSVD
    h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
    h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;        gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_S;         gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
    cuComplex *d_U = NULL;
    cuComplex *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif

    cuComplex *d_work = NULL; /* devie workspace for gesvdj */
    int devInfo_h = 0; /* host copy of error devInfo_h */

    // --- Parameters configuration of Jacobi-based SVD
    const double            tol = 1.e-7;
    const int               maxSweeps = 15;
    cusolverEigMode_t jobz;                                   // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    const int               econ = 0;                            // --- econ = 1 for economy size 

    // --- Numerical result parameters of gesvdj 
    double                  residual = 0;
    int                     executedSweeps = 0;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace 
    cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M,N) if econ = 1
        lda,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnCgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M, N) if econ = 1
        N,                                          // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,
        gesvdj_params,
        numMatrices));

    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif

#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++)
    {
        for (int p = 0; p < N; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
        printf("\n");
    }
#if 0 //FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++)
    {
        for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
        printf("\n");
    }

    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++)
    {
        for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
        printf("\n");
    }
#endif
#endif

    if (0 == devInfo_h)
    {
        printf("gesvdj converges \n");
    }
    else if (0 > devInfo_h)
    {
        printf("%d-th parameter is wrong \n", -devInfo_h);
        exit(1);
    }
    else
    {
        printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
    }

    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

    gpuErrchk(cudaDeviceReset());

    return 0;
}

TimingCPU.cpp

/* TIMING CPU */
/**************/

#include "TimingCPU.h"

#ifdef __linux__

#include <sys/time.h>
#include <stdio.h>

TimingCPU::TimingCPU() : cur_time_(0) {
    StartCounter();
}

TimingCPU::~TimingCPU() { }

void TimingCPU::StartCounter()
{
    struct timeval time;
    if (gettimeofday(&time, 0)) return;
    cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
}

double TimingCPU::GetCounter()
{
    struct timeval time;
    if (gettimeofday(&time, 0)) return -1;

    long cur_time = 1000000 * time.tv_sec + time.tv_usec;
    double sec = (cur_time - cur_time_) / 1000000.0;
    if (sec < 0) sec += 86400;
    cur_time_ = cur_time;

    return 1000. * sec;
}

#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>

struct PrivateTimingCPU {
    double  PCFreq;
    __int64 CounterStart;
};

// --- Default constructor
TimingCPU::TimingCPU() {
    privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0;
}

// --- Default destructor
TimingCPU::~TimingCPU() { }

// --- Starts the timing
void TimingCPU::StartCounter()
{
    LARGE_INTEGER li;
    if (!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";

    (*privateTimingCPU).PCFreq = double(li.QuadPart) / 1000.0;

    QueryPerformanceCounter(&li);
    (*privateTimingCPU).CounterStart = li.QuadPart;
}

// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return double(li.QuadPart - (*privateTimingCPU).CounterStart) / (*privateTimingCPU).PCFreq;
}
#endif

TimingCPU.h

// 1 micro-second accuracy
// Returns the time in seconds

#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__

#ifdef __linux__

class TimingCPU {

private:
    long cur_time_;

public:

    TimingCPU();

    ~TimingCPU();

    void StartCounter();

    double GetCounter();
};

#elif _WIN32 || _WIN64

struct PrivateTimingCPU;

class TimingCPU
{
private:
    PrivateTimingCPU *privateTimingCPU;

public:

    TimingCPU();

    ~TimingCPU();

    void StartCounter();

    double GetCounter();

}; // TimingCPU class

#endif

#endif

TimingGPU.cu

/**************/
/* TIMING GPU */
/**************/

#include "TimingGPU.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

struct PrivateTimingGPU {
    cudaEvent_t     start;
    cudaEvent_t     stop;
};

// default constructor
TimingGPU::TimingGPU() {
    privateTimingGPU = new PrivateTimingGPU;
}

// default destructor
TimingGPU::~TimingGPU() { }

void TimingGPU::StartCounter()
{
    cudaEventCreate(&((*privateTimingGPU).start));
    cudaEventCreate(&((*privateTimingGPU).stop));
    cudaEventRecord((*privateTimingGPU).start, 0);
}

void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    cudaEventCreateWithFlags(&((*privateTimingGPU).start), eventflags);
    cudaEventCreateWithFlags(&((*privateTimingGPU).stop), eventflags);
    cudaEventRecord((*privateTimingGPU).start, 0);
}

// Gets the counter in ms
float TimingGPU::GetCounter()
{
    float   time;
    cudaEventRecord((*privateTimingGPU).stop, 0);
    cudaEventSynchronize((*privateTimingGPU).stop);
    cudaEventElapsedTime(&time, (*privateTimingGPU).start, (*privateTimingGPU).stop);
    return time;
}

TimingGPU.cuh

#ifndef __TIMING_CUH__
#define __TIMING_CUH__

/**************/
/* TIMING GPU */
/**************/

// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.

struct PrivateTimingGPU;

class TimingGPU
{
private:
    PrivateTimingGPU *privateTimingGPU;

public:

    TimingGPU();

    ~TimingGPU();

    void StartCounter();
    void StartCounterFlags();

    float GetCounter();

}; // TimingCPU class

#endif

Utilities.cu

#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b) {
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) {
            exit(code);
        }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) {
    gpuAssert((ans), __FILE__, __LINE__);
}

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";

        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";

        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";

        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";

        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";

        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";

        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";

        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n", __FILE__, __LINE__, err, \
                _cudaGetErrorEnum(err)); \
            cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) {
    __cusolveSafeCall(err, __FILE__, __LINE__);
}

Utilities.cuh

#ifndef UTILITIES_CUH
#define UTILITIES_CUH

extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);

#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif

template< typename T >
void check(T result, char const *const func, const char *const file, int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
                file, line);
        //fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
        //        file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
        DEVICE_RESET
            // Make sure we call CUDA Device Reset before exiting
            exit(EXIT_FAILURE);
    }
}

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)

#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
#endif

Could anyone suggest fixes for the errors I am getting in svd function and the errors after that.

Solution

Referring to the documentation, for the info parameter:

info device output an integer array of dimension batchSize

So this is expected to be an array of integers of size equal to the number of matrices in the batch. This makes sense because we expect one of these info reports for each matrix. But your allocation does not do that:

int *devInfo;        gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

When I fix that:

int *devInfo;        gpuErrchk(cudaMalloc(&devInfo, sizeof(int) * numMatrices));

the error goes away for me. This also has implications for your host-side allocation and also the copying of this data from device to host, later in the code.