Tags: visual-studio-2012, cuda, cub

Getting CUB DeviceScan to work when called from a kernel


I am currently trying to learn how to use CUB so that I can perhaps rewrite my integrator code. I've been looking at the examples and code snippets in the docs, but I have not yet found an example of what I'm trying to do: running an InclusiveSum from a single "master" thread inside a kernel. All the examples I've seen call the function from the host rather than the device, but the docs hint that a device-side call is possible here: http://nvlabs.github.io/cub/structcub_1_1_device_scan.html#a7bcc25e4d9c14a23f71431cf1a6b2bd5

"When calling this method from kernel code, be sure to define the CUB_CDP macro in your compiler's macro definitions."

I've tried adding this in Visual Studio 2012 by going to my project's Properties -> CUDA Linker -> Command Line and adding "-DCUB_CDP". I'm not sure that is the right place, but I do get the following build line:

"nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_amd64" -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -DCUB_CDP -g -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MT " -o "x64\Debug\Algorithm Test.cu.obj" "C:\Users...\Algorithm Test.cu"

My test code involves a test kernel run with 1 thread to simulate how my actual code works.

#define CUB_STDERR
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define NUMPOINTS 5*1024    
#define NUMTHREADSPERBLOCK 256
#define NUMBLOCKSPERGRID 32
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID   //Use multiple of 256

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip>                      //display 2 decimal places
#include <math.h>
#include <arrayFunctions.h>
#include <ctime>                        //For timers
#include <sstream>                      //For the filename
#include <assert.h>
#include <stdlib.h>
#include <cub/cub.cuh>


#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#undef  assert
#define assert(arg)
#endif

__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK;    //Number of threads per block
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID;       //Number of blocks per grid
__device__ __constant__ int maxlength = MAXLENGTH;  
__device__ double concSort[MAXLENGTH];

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    //Error checking
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}


using namespace std;
using namespace cub;

__global__ void test(double*);

int main(int argc, char** argv)
{
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    cudaSetDevice(0);
    std::cout << std::fixed;                //Use fixed-point notation.
    std::cout << std::setprecision(16);     //Display 16 decimal places.

    const int maxlength = MAXLENGTH;        //Number of discrete concentrations tracking.
    double concs[maxlength] = {};           //Meant to store the initial concentrations .


    std::cout<<" ";
    std::cout<<"\n";    

    double *d_concs;                //The concentrations for a specific timestep.

    size_t size_concs = sizeof(concs);


    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));


    //Run the integrator.
    std::clock_t start;
    double duration;
    start = std::clock();

    test<<<1,1>>>(d_concs);

    std::cout<<"\n";

    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    duration = (std::clock() - start)/ (double) CLOCKS_PER_SEC;
    std::cout<<"The calculation took this long: "<< duration <<'\n';

    std::cout<<"\n";

    gpuErrchk(cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();

    ///*
    for (int i=0; i < 33; i++)
    {
        std::cout << "\n";
        std::cout << concs[i];
    }
    //*/

    cudaDeviceReset();  //Clean up all memory.
    return 0;
}



__global__ void test(double* concs)
{
    int size=MAXLENGTH;
    int threads = NUMTHREADSPERBLOCK;
    int blocks = NUMBLOCKSPERGRID;

    for (int i = 0; i < size; i++)
        concs[i] = i * .00000000001;

    ///*
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    CubDebug(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, concs, concs, size));
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    CubDebug(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, concs, concs, size));

}

I get the following errors, which, going by the post linked below, suggest that the CUB_CDP macro is not actually being defined:

1>C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/Algorithm Test.cu(146): error : calling a __host__ function("exit") from a __global__ function("test") is not allowed
1>C:/Users/Karsten Chu/New Google Drive/Research/Visual Studio 2012/Projects/Dynamic Parallelism Test/Dynamic Parallelism Test/Algorithm Test.cu(148): error : calling a __host__ function("exit") from a __global__ function("test") is not allowed

https://groups.google.com/forum/#!searchin/cub-users/CUB_CDP/cub-users/9ltP52Ohosg/uM9_RUy11e0J

I'd appreciate any help as I think learning how to use this library could really help me start focusing on PHYSICS rather than...anything but physics.


Solution

  • Remove the CubDebugExit() wrapper from your cub calls in the test kernel. Then your code will compile.

    Instead of this:

    CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concSort, maxlength));
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concSort, maxlength));
    

    Do this:

    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concSort, maxlength);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concSort, maxlength);
    

    The CubDebugExit macro is not usable in device code because it calls exit(), a host-only function; that is exactly what the compiler errors above are complaining about.

    If you prefer, you can also use CubDebug() instead of the CubDebugExit() wrapper/macro.
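
    If you still want some error reporting inside the kernel, a minimal device-safe sketch is to check the returned cudaError_t yourself and report it with printf(), which, unlike exit(), can be called from device code. The kernel and parameter names below are just illustrative:

    __global__ void scan_kernel(double *data, int size)
    {
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;

        // First call: query how much temporary storage the scan needs.
        cudaError_t err = cub::DeviceScan::InclusiveSum(
            d_temp_storage, temp_storage_bytes, data, data, size);
        if (err != cudaSuccess)
            printf("InclusiveSum size query failed: error %d\n", (int)err);

        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Second call: run the actual device-wide inclusive scan.
        err = cub::DeviceScan::InclusiveSum(
            d_temp_storage, temp_storage_bytes, data, data, size);
        if (err != cudaSuccess)
            printf("InclusiveSum failed: error %d\n", (int)err);

        cudaFree(d_temp_storage);
    }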