cudaMalloc is not modifying return pointer when called inside template function

Consider the following sample program:

#include <iostream>

using namespace std;
// Pointer variable declared with the __constant__ qualifier; it is not
// referenced by any of the code shown in this example.
__device__ __constant__ float* data;

// Attempts to allocate numElem elements of type T on the device.
// Note: deviceDest is a by-value parameter, so cudaMalloc below stores the
// new device address in this function's local copy of the pointer; that copy
// is gone once the function returns.
template<class T> void allocOnly(T* deviceDest, size_t numElem)
{
    // &deviceDest is the address of the local parameter, not of the caller's variable.
    cudaError_t errCode = cudaMalloc((void**)&deviceDest, numElem*sizeof(T));
    if(errCode != cudaSuccess) 
        cout << "Got error with code " << errCode << endl;
}

// Compares allocating through allocOnly with calling cudaMalloc directly,
// printing the host-side pointer value after each attempt.
int main()
{
    // Case 1: allocate through the template helper, passing the pointer itself.
    float* test(0);
    allocOnly<float>(test,10);
    cout << "test = " << test << endl;

    // Case 2: call cudaMalloc directly, passing the address of test2.
    float* test2(0);    
    cudaError_t errCode = cudaMalloc((void**)&test2, 10*sizeof(float));
    if(errCode != cudaSuccess) 
        cout << "Got error with code " << errCode << endl;
    cout << "test2 = " << test2 << endl;

    return 0;
}

compiled with nvcc test.cu -o testBin

returns

test = 0
test2 = 0x310100

Why is test not modified when the allocation goes through the template function? cudaMalloc is supposed to set it to point to the newly allocated device memory.

Solution

The pointer is not modified because allocOnly receives deviceDest by value: cudaMalloc writes the new device address into that local copy, which is discarded when the function returns, so the caller's test stays 0. This has nothing to do with the function being a template. You can change allocOnly to take a pointer to the pointer instead:

// Allocates room for numElem elements of type T on the device and stores the
// resulting device address in *deviceDest.
//
//   deviceDest  address of the caller's pointer (e.g. &test); taking T**
//               rather than T* is what lets the allocation reach the caller.
//   numElem     number of elements of type T to allocate.
//
// On failure the error code and its description are printed and *deviceDest
// is set to null, so the caller can detect the failure by testing the pointer.
template<class T> void allocOnly(T** deviceDest, size_t numElem)
{
    // Without a destination there is nowhere to store the device address.
    if(deviceDest == 0)
    {
        cout << "Got error: deviceDest is null" << endl;
        return;
    }

    cudaError_t errCode = cudaMalloc((void**)deviceDest, numElem*sizeof(T));
    if(errCode != cudaSuccess)
    {
        // Print the readable description alongside the numeric code.
        cout << "Got error with code " << errCode
             << " (" << cudaGetErrorString(errCode) << ")" << endl;
        // The function returns void, so a null pointer is the only failure
        // signal the caller can check.
        *deviceDest = 0;
    }
}

Then, inside the main function, pass the address of the pointer:

// Excerpt of main showing the changed call site; the rest is elided.
int main()
{
    float* test(0);
    // Pass the address of test so allocOnly can write the device address into it.
    allocOnly<float>(&test,10);
    cout << "test = " << test << endl;
    .
    .
    .

}