I'm running CUDA 4.2 on Windows 7 64-bit in the Visual Studio 2010 Professional environment.
First, I have the following code running:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
__global__
void dosomething(int *d_bPtr, int count, int *d_bStopPtr)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
        d_bStopPtr[tid] = 0;
    else if (tid < count)
    {
        d_bPtr[tid] = tid;
        // only if the array cell before it is 0, then change it to 0 too
        if (d_bStopPtr[tid-1] == 0)
            d_bStopPtr[tid] = 0;
    }
}
int main()
{
    int count = 100000;
    // define the vectors
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    int *d_bPtr = thrust::raw_pointer_cast(&d_b[0]);
    thrust::device_vector<int> d_bStop(count, 1);
    int *d_bStopPtr = thrust::raw_pointer_cast(&d_bStop[0]);
    // get the device property
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int threadsPerBlock = prop.maxThreadsDim[0];
    int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
    // copy device to host
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // run the kernel
    while (d_bStop[count-1])
    {
        dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);
    }
    // copy device back to host again
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // wait to see the console output
    int x;
    cin >> x;
    return 0;
}
However, each iteration I need to check the while condition on the host, and that is slow. So I'm thinking of checking the condition of this device vector inside the kernel instead, and changing the code like this:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
__global__
void dosomething(int *d_bPtr, int count, int *d_bStopPtr)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
        d_bStopPtr[tid] = 0;
    else if (tid < count)
    {
        // if the last cell of the array is still not 0 yet, repeat
        while (d_bStopPtr[count-1])
        {
            d_bPtr[tid] = tid;
            // only if the array cell before it is 0, then change it to 0 too
            if (d_bStopPtr[tid-1] == 0)
                d_bStopPtr[tid] = 0;
        }
    }
}
int main()
{
    int count = 100000;
    // define the vectors
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    int *d_bPtr = thrust::raw_pointer_cast(&d_b[0]);
    thrust::device_vector<int> d_bStop(count, 1);
    int *d_bStopPtr = thrust::raw_pointer_cast(&d_bStop[0]);
    // get the device property
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int threadsPerBlock = prop.maxThreadsDim[0];
    int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
    // copy device to host
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // run the kernel
    dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);
    // copy device back to host again
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // wait to see the console output
    int x;
    cin >> x;
    return 0;
}
However, the second version always causes the graphics card and the computer to hang. Can you please help me speed up the first version? How can I check the condition inside the kernel, then jump out and stop the kernel?
You are basically looking for global thread synchronous behavior. This is a no-no in GPU programming. Ideally each threadblock is independent and can complete its work based on its own data and processing. Creating threadblocks that depend on the results of other threadblocks to complete their work creates the possibility of a deadlock condition. Suppose I have a GPU with 14 SMs (threadblock execution units), and suppose I create 100 threadblocks. Now suppose threadblocks 0-13 are waiting for threadblock 99 to release a lock (e.g. write a zero value to a particular location). Now suppose those first 14 threadblocks begin executing on the 14 SMs, perhaps looping, spinning on the lock value. There is no mechanism in the GPU to guarantee that threadblock 99 will execute first, or even execute at all, if threadblocks 0-13 have the SMs tied up.
Let's not get into questions about "what about GMEM stalls that force eviction of threadblocks 0-13" because none of that guarantees that threadblock 99 will get priority to execute at any point. The only thing that guarantees that threadblock 99 will execute is the draining (i.e. completion) of other threadblocks. But if the other threadblocks are spinning, waiting for results from threadblock 99, that may never happen.
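To make the hazard concrete, here is a minimal sketch of the pattern described above (a hypothetical kernel, not your code): if the spinning blocks occupy all of the SMs before the releasing block is ever scheduled, this kernel can never finish.

__global__ void hazard(volatile int *flag)
{
    if (blockIdx.x == gridDim.x - 1)
        *flag = 1;                  // the "last" block releases the lock
    else
        while (*flag == 0) { }      // every other block spins, possibly forever
}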
Good forward-compatible, scalable GPU code depends on independent parallel work. So you're advised to re-craft your algorithm to make the work you are trying to accomplish independent, at least at the inter-threadblock level.
If you must do global thread syncing, the kernel launch is the only truly guaranteed point for this, and thus your first approach is the working approach.
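As a rough sketch of that working pattern (step_one, step_two, and d_data are hypothetical names): kernels launched into the same stream run in issue order, so the launch boundary itself is the global synchronization point, and a host-side sync is only needed when the host has to inspect the results in between.

step_one<<<blocksPerGrid, threadsPerBlock>>>(d_data);   // all blocks of step_one finish...
step_two<<<blocksPerGrid, threadsPerBlock>>>(d_data);   // ...before any block of step_two starts
cudaDeviceSynchronize();                                // only needed before the host reads d_data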
To help with this, it may be useful to study how reduction algorithms get implemented on a GPU. Various types of reductions have dependencies across all threads, but by creating intermediate results, we can break the work into independent pieces. The independent pieces can then be aggregated using a multi-kernel approach (or some other more advanced approaches) to speed up what amounts to a serial algorithm.
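As an illustration of that idea (a standard sum reduction sketch, not specific to your problem), each block produces one independent partial result using only intra-block synchronization:

__global__ void partial_sums(const int *in, int *partial, int count)
{
    extern __shared__ int s[];                 // sized at launch: threads*sizeof(int)
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    s[tid] = (i < count) ? in[i] : 0;
    __syncthreads();
    // tree reduction within the block (assumes blockDim.x is a power of two);
    // no block ever waits on another block
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride) s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0) partial[blockIdx.x] = s[0];  // one independent result per block
}

The second stage can simply run the same kernel again over the partial array (or use something like thrust::reduce), and the kernel launch boundary between the two stages provides the only ordering guarantee that is actually needed.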
Your kernel doesn't actually do much. It sets one array equal to its index, i.e. a[i] = i;, and it sets the other array to all zeros (although sequentially), i.e. b[i] = 0;. To show an example of your first code "sped up", you could do something like this:
// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
using namespace std;
//kernel function
__global__
void dosomething(int *d_bPtr, int count, int *d_bStopPtr)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < count)
    {
        d_bPtr[tid] = tid;
        // spin until this thread's stop cell has been set to 0
        while (d_bStopPtr[tid] != 0)
            // only if the array cell before it is 0, then change it to 0 too
            if (tid == 0) d_bStopPtr[tid] = 0;
            else if (d_bStopPtr[tid-1] == 0)
                d_bStopPtr[tid] = 0;
        tid += blockDim.x;
    }
}
int main()
{
    int count = 100000;
    // define the vectors
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    int *d_bPtr = thrust::raw_pointer_cast(&d_b[0]);
    thrust::device_vector<int> d_bStop(count, 1);
    int *d_bStopPtr = thrust::raw_pointer_cast(&d_bStop[0]);
    // get the device property
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // int threadsPerBlock = prop.maxThreadsDim[0];
    int threadsPerBlock = 32;
    // int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
    int blocksPerGrid = 1;
    // copy device to host
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // run the kernel
    // while(d_bStop[count-1])
    // {
    dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);
    // }
    // copy device back to host again
    cudaDeviceSynchronize();
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;
    // wait to see the console output
    int x;
    cin >> x;
    return 0;
}
On my machine this reduces the execution time from about 10 seconds to almost instantaneous (much less than 1 second). Note that this is not a great example of CUDA programming, because I am only launching one block of 32 threads. That's not enough to effectively utilize the machine. But the work done by your kernel is so trivial that I'm not sure what a good example would be. I could just create a kernel that sets one array to its index (a[i] = i;) and the other array to zero (b[i] = 0;), all in parallel. That would be even faster, and we could use the whole machine that way.
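For completeness, that fully parallel version might look something like the sketch below (init_arrays is just an illustrative name; it uses your existing d_bPtr, d_bStopPtr and count). Every element is independent, so no thread ever waits on another.

__global__ void init_arrays(int *a, int *b, int count)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < count)
    {
        a[i] = i;   // a[i] = i
        b[i] = 0;   // b[i] = 0
    }
}
// launched e.g. as:
// init_arrays<<<(count + 255) / 256, 256>>>(d_bPtr, d_bStopPtr, count);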