I am using cusolverDnCgesvdjBatched function to calculate singular value decomposition (SVD) of multiple matrices, I use cuda-memcheck to check any memory issues, I am getting an error like this in the cusolverDnCgesvdjBatched function.
========= Invalid __global__ write of size 4
========= at 0x000062f8 in void batched_svd_parallel_jacobi_32x16<float2, float>(int, int, int, int, float2*, __int64, int, float*, float2*, __int64, int, float2*, __int64, int, float, int, int*, float, int, int*, int, float)
========= by thread (0,0,0) in block (4,0,0)
========= Address 0x701019010 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x904ce) [0x2ae05e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x92e73) [0x2b0a03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x84cb7]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll [0x86e03]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ca]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvami.inf_amd64_72390dc4652f28fa\nvcuda64.dll (cuModuleUnload + 0x1d6) [0x1d5d36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::module::unload + 0x115) [0x9535]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextState::unloadAllModules + 0x196) [0x9b36]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0xa188]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::globalState::~globalState + 0x3d) [0x24dd]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x74c6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x156) [0x142d6]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x7b) [0x141fb]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (execute_onexit_table + 0x34) [0x141b4]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x142) [0x20522]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0xcb) [0x204ab]
========= Host Frame:C:\WINDOWS\System32\ucrtbase.dll (exit + 0x6e) [0x2044e]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (gpuErrchk + 0x4c) [0xf0dc]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (main + 0x3ef) [0xebaf]
========= Host Frame:D:\SVD\x64\Release\SVD.exe (__scrt_common_main_seh + 0x10c) [0xf5c4]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.dll (BaseThreadInitThunk + 0x14) [0x17034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
=========
========= ERROR SUMMARY: 8 errors
I am attaching the whole code I am using.
kernel.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define FULLSVD
#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 10;
const int N = 5;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 256;
TimingGPU timerGPU;
// --- Setting the host matrix
cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++)
{
for (unsigned int j = 0; j < N; j++)
{
h_A[k * M * N + j * M + i] = make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
//printf("[%d, %d] %f\n", i, j, h_A[j*M + i]);
//printf("%f %f", h_A[j*M + i].x, h_A[j * M + i].y);
}
//printf("\n");
}
// --- Setting the device matrix and moving the host matrix to the device
cuComplex *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
cuComplex *h_U = NULL;
cuComplex *h_V = NULL;
#ifdef FULLSVD
h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
cuComplex *d_U = NULL;
cuComplex *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif
cuComplex *d_work = NULL; /* devie workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnCgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
N, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES \n");
printf("_______________ \n");
for (int k = 0; k < numMatrices; k++)
{
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
printf("\n");
}
#if 0 //FULLSVD
printf("SINGULAR VECTORS U \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++)
{
for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
printf("\n");
}
printf("SINGULAR VECTORS V \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++)
{
for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
printf("\n");
}
#endif
#endif
if (0 == devInfo_h)
{
printf("gesvdj converges \n");
}
else if (0 > devInfo_h)
{
printf("%d-th parameter is wrong \n", -devInfo_h);
exit(1);
}
else
{
printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
}
// --- Free resources
if (d_A) gpuErrchk(cudaFree(d_A));
if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
if (d_U) gpuErrchk(cudaFree(d_U));
if (d_V) gpuErrchk(cudaFree(d_V));
#endif
if (devInfo) gpuErrchk(cudaFree(devInfo));
if (d_work) gpuErrchk(cudaFree(d_work));
if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
gpuErrchk(cudaDeviceReset());
return 0;
}
TimingCPU.cpp
/* TIMING CPU */
/**************/
#include "TimingCPU.h"
#ifdef __linux__
#include <sys/time.h>
#include <stdio.h>
TimingCPU::TimingCPU() : cur_time_(0) {
StartCounter();
}
TimingCPU::~TimingCPU() { }
void TimingCPU::StartCounter()
{
struct timeval time;
if (gettimeofday(&time, 0)) return;
cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
}
double TimingCPU::GetCounter()
{
struct timeval time;
if (gettimeofday(&time, 0)) return -1;
long cur_time = 1000000 * time.tv_sec + time.tv_usec;
double sec = (cur_time - cur_time_) / 1000000.0;
if (sec < 0) sec += 86400;
cur_time_ = cur_time;
return 1000. * sec;
}
#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>
struct PrivateTimingCPU {
double PCFreq;
__int64 CounterStart;
};
// --- Default constructor
TimingCPU::TimingCPU() {
privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0;
}
// --- Default destructor
TimingCPU::~TimingCPU() { }
// --- Starts the timing
void TimingCPU::StartCounter()
{
LARGE_INTEGER li;
if (!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";
(*privateTimingCPU).PCFreq = double(li.QuadPart) / 1000.0;
QueryPerformanceCounter(&li);
(*privateTimingCPU).CounterStart = li.QuadPart;
}
// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart - (*privateTimingCPU).CounterStart) / (*privateTimingCPU).PCFreq;
}
#endif
TimingCPU.h
// 1 micro-second accuracy
// Returns the time in seconds
#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__
#ifdef __linux__
class TimingCPU {
private:
long cur_time_;
public:
TimingCPU();
~TimingCPU();
void StartCounter();
double GetCounter();
};
#elif _WIN32 || _WIN64
struct PrivateTimingCPU;
class TimingCPU
{
private:
PrivateTimingCPU *privateTimingCPU;
public:
TimingCPU();
~TimingCPU();
void StartCounter();
double GetCounter();
}; // TimingCPU class
#endif
#endif
TimingGPU.cu
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
struct PrivateTimingGPU {
cudaEvent_t start;
cudaEvent_t stop;
};
// default constructor
TimingGPU::TimingGPU() {
privateTimingGPU = new PrivateTimingGPU;
}
// default destructor
TimingGPU::~TimingGPU() { }
void TimingGPU::StartCounter()
{
cudaEventCreate(&((*privateTimingGPU).start));
cudaEventCreate(&((*privateTimingGPU).stop));
cudaEventRecord((*privateTimingGPU).start, 0);
}
void TimingGPU::StartCounterFlags()
{
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&((*privateTimingGPU).start), eventflags);
cudaEventCreateWithFlags(&((*privateTimingGPU).stop), eventflags);
cudaEventRecord((*privateTimingGPU).start, 0);
}
// Gets the counter in ms
float TimingGPU::GetCounter()
{
float time;
cudaEventRecord((*privateTimingGPU).stop, 0);
cudaEventSynchronize((*privateTimingGPU).stop);
cudaEventElapsedTime(&time, (*privateTimingGPU).start, (*privateTimingGPU).stop);
return time;
}
TimingGPU.cuh
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
struct PrivateTimingGPU;
class TimingGPU
{
private:
PrivateTimingGPU *privateTimingGPU;
public:
TimingGPU();
~TimingGPU();
void StartCounter();
void StartCounterFlags();
float GetCounter();
}; // TimingCPU class
#endif
Utilities.cu
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
extern "C" void gpuErrchk(cudaError_t ans) {
gpuAssert((ans), __FILE__, __LINE__);
}
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if (CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n", __FILE__, __LINE__, err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) {
__cusolveSafeCall(err, __FILE__, __LINE__);
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
template< typename T >
void check(T result, char const *const func, const char *const file, int const line)
{
if (result)
{
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
file, line);
//fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
// file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
DEVICE_RESET
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
#endif
Could anyone suggest fixes for the errors I am getting in svd function and the errors after that.
Referring to the documentation, for the info
parameter:
info device output an integer array of dimension batchSize
So this is expected to be an array of integers of size equal to the number of matrices in the batch. This makes sense because we expect one of these info reports for each matrix. But your allocation does not do that:
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
When I fix that:
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int) * numMatrices));
the error goes away for me. This also has implications for your host-side allocation and also the copying of this data from device to host, later in the code.