When I use cublasIsamax with regular memory allocator - it works fine.
int FindMaxIndex( const float* pVector, const size_t length )
{
int result = 0;
float* pDevVector = nullptr;
if( CUBLAS_STATUS_SUCCESS != ::cudaMalloc( (void**)&pDevVector, length * sizeof(float) ) )
{
return -1;
}
if( CUBLAS_STATUS_SUCCESS != ::cudaMemcpy( pDevVector, pVector, length * (int)sizeof(float), cudaMemcpyHostToDevice) )
{
return -2;
}
::cublasIsamax_v2( g_handle, length, pDevVector, 1, &result);
if( nullptr != pDevVector )
{
::cudaFree( pDevVector );
}
return result;
}
But if try with constant memory it failes with unknown error N14. What is wrong? Copy to constant memory is successed but exeution is failed.
__constant__ float c_pIndex[ 255 ] = {0x00};
// the same function as GetIsMax but using CUBLAS function cublasIsamax_v2
int FindMaxIndexConst( const float* pVector, const size_t length, pfnMsg fnMsg )
{
int result = 0;
cudaError_t code = ::cudaMemcpyToSymbol( c_pIndex, pVector, length * sizeof(float), 0, cudaMemcpyHostToDevice );
if( cudaSuccess != code )
{
const char* szMsg = ::cudaGetErrorString ( code );
LogError3( L"[%d] [%hs] Could not allocate CUDA memory: %I64d pDevA", code, szMsg, (__int64)(length * sizeof(float)));
}
cublasStatus_t status = ::cublasIsamax_v2( g_handle, length, c_pIndex, 1, &result);
if( CUBLAS_STATUS_SUCCESS != status )
{
LogError2( L" [%d] Failed to execute <cublasIsamax_v2> : %I64d", status, (__int64)length );
}
return result;
}
Why not allocate a regular device array and pass that to CUBLAS?
A __constant__
array is not a normal __device__
array. In your code you are taking the address of the array and passing it to a host function. The address of the array on the host is not valid on the device, and vice versa, as described in the CUDA programming guide. See the CUDA Programming Guide:
The address obtained by taking the address of a
__device__
,__shared__
or__constant__
variable can only be used in device code. The address of a__device__
or__constant__
variable obtained through cudaGetSymbolAddress() as described in Device Memory can only be used in host code.
As for accessing __constant__
memory via a device pointer, see this answer for why it will be uncached.
Finally, using __constant__
memory in this way even if it is cached in the constant cache, is inefficient due to the access pattern. The constant cache is optimized for uniform access across threads in a warp. isamax
is likely to access different memory locations in every thread, and therefore the accesses will be serialized. Thus this will be 32x slower than accessing uniformly (and likely much slower than regular device memory).