
CUDA Reduction: Warp Unrolling (School)


I am currently working on a project in which I am unrolling the last warp of a reduction. I have finished the code below; however, some of the modifications were made by guessing and I'd like an explanation of why they work. The code I have written is only the function kernel4

// in is input array, out is where to store result, n is number of elements from in
// T is a float (32bit)
__global__ void kernel4(T *in, T *out, unsigned int n)

which is a reduction algorithm; the rest of the code was already provided.

Code:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>   // fabs
#include <time.h>   // time(), used to seed srand48

#include "timer.h"
#include "cuda_utils.h"

typedef float T;

#define N_ (8 * 1024 * 1024)
#define MAX_THREADS 256
#define MAX_BLOCKS 64

#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#define tid threadIdx.x 
#define bid blockIdx.x 
#define bdim blockDim.x
#define warp_size 32

// Round x up to the next power of 2 (returns x unchanged if it is already one).
unsigned int nextPow2( unsigned int x ) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3) {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    } else {
        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }
    if (whichKernel == 5)
        blocks = MIN(maxBlocks, blocks);
}

T reduce_cpu(T *data, int n) {   // Kahan (compensated) summation
    T sum = data[0];
    T c = (T) 0.0;
    for (int i = 1; i < n; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    } 
    return sum;
}

__global__ void
kernel4(T *in, T *out, unsigned int n)
{
    __shared__ volatile T d[MAX_THREADS];

    unsigned int i = bid * bdim + tid;

    n >>= 1;
    d[tid] = (i < n) ? in[i] + in[i+n] : 0;
    __syncthreads ();

    for(unsigned int s = bdim >> 1; s > warp_size; s >>= 1) {
        if(tid < s)
            d[tid] += d[tid + s];
        __syncthreads ();
    }

    if (tid < warp_size) {
        if (n > 64) d[tid] += d[tid + 32];
        if (n > 32) d[tid] += d[tid + 16];
        d[tid] += d[tid + 8];
        d[tid] += d[tid + 4];
        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    }

    if(tid == 0)
        out[bid] = d[0];
}

int main(int argc, char** argv)
{
    T *h_idata;            // host input array
    T h_odata, h_cpu;      // scalar results (GPU and CPU)
    T *d_idata, *d_odata;

    struct stopwatch_t* timer = NULL;
    long double t_kernel_4, t_cpu;

    int whichKernel = 4, threads, blocks, N, i;
    if(argc > 1) {
        N = atoi (argv[1]);
        printf("N: %d\n", N);
    } else {
        N = N_;
        printf("N: %d\n", N);
    }

    getNumBlocksAndThreads (whichKernel, N, MAX_BLOCKS, MAX_THREADS, blocks, threads);

    stopwatch_init ();
    timer = stopwatch_create ();

    h_idata = (T*) malloc (N * sizeof (T));
    CUDA_CHECK_ERROR (cudaMalloc (&d_idata, N * sizeof (T)));
    CUDA_CHECK_ERROR (cudaMalloc (&d_odata, blocks * sizeof (T)));

    srand48(time(NULL));
    for(i = 0; i < N; i++)
        h_idata[i] = drand48() / 100000;
    CUDA_CHECK_ERROR (cudaMemcpy (d_idata, h_idata, N * sizeof (T), cudaMemcpyHostToDevice));

    dim3 gb(blocks, 1, 1);
    dim3 tb(threads, 1, 1);

    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);
    cudaDeviceSynchronize ();  // (cudaThreadSynchronize is deprecated)

    stopwatch_start (timer);

    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);
    int s = blocks;
    while(s > 1) {
        threads = 0;
        blocks = 0;
        getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);

        dim3 gb(blocks, 1, 1);
        dim3 tb(threads, 1, 1);

        kernel4 <<<gb, tb>>> (d_odata, d_odata, s);
        s = (s + threads * 2 - 1) / (threads * 2);
    }
    cudaDeviceSynchronize ();

    t_kernel_4 = stopwatch_stop (timer);
    fprintf (stdout, "Time to execute unrolled GPU reduction kernel: %Lg secs\n", t_kernel_4);

    double bw = (N * sizeof(T)) / (t_kernel_4 * 1e9);   // total bytes / time, in GB/s
    fprintf (stdout, "Effective bandwidth: %.2lf GB/s\n", bw);
    CUDA_CHECK_ERROR (cudaMemcpy (&h_odata, d_odata, sizeof (T), cudaMemcpyDeviceToHost));

    stopwatch_start (timer);
    h_cpu = reduce_cpu (h_idata, N);
    t_cpu = stopwatch_stop (timer);
    fprintf (stdout, "Time to execute naive CPU reduction: %Lg secs\n", t_cpu);

    if(fabs (h_odata - h_cpu) > 1e-5)
        fprintf(stderr, "FAILURE: GPU: %f  CPU: %f\n", h_odata, h_cpu);
    else
        printf("SUCCESS: GPU: %f  CPU: %f\n", h_odata, h_cpu);
    return 0;
}

My first question is: when declaring

__shared__ volatile T d[MAX_THREADS];

I would like to verify my understanding of volatile. Volatile prevents the compiler from incorrectly optimizing my code and promises that loads/stores are completed through the cache and not just registers (please correct me if I'm wrong). For a reduction, if partial reduction sums are still stored in registers, why is this a problem?

My second question is: when doing the actual warp reduction

    if (tid < warp_size) { // Final log2(32) = 5 strides
        if (n > 64) d[tid] += d[tid + 32];
        if (n > 32) d[tid] += d[tid + 16];
        d[tid] += d[tid + 8];
        d[tid] += d[tid + 4];
        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    }

The reduction sum yields incorrect results without the (n > 64) and (n > 32) conditions. The results I get are:

FAILURE: GPU: 41.966557  CPU: 41.946209

Across 5 trials, the GPU reduction consistently yields an error of 0.0204. I am wary of attributing this to floating-point error.

To be honest, my teaching assistant suggested adding the (n > 64) and (n > 32) conditions but did not explain why doing so fixes the code.

Since n in my trials is over 64, why does this conditional change the results? I am having difficulty tracing the problem back because I cannot use print functions like I would on a CPU.


Solution

  • Let's start with a few preface comments before we tackle your two questions:

    • I encourage you to read NVIDIA's canonical reduction tutorial
    • Reductions written like this make several assumptions, one of which is that the block size is a power-of-2 (for "correctness").
    • Your code is using warp-synchronous programming in the final reduction stage. You appear to know what you are doing, so I won't describe that technique in detail, but it is certainly relevant here; you can google it for descriptions if needed. It underlies the discussion below, even where I don't call it out explicitly.

    OK, now your questions:

    I would like to verify my understanding of volatile. Volatile prevents the compiler from incorrectly optimizing my code and promises that loads/stores are completed through the cache and not just registers (please correct me if I'm wrong). For a reduction, if partial reduction sums are still stored in registers, why is this a problem?

    Regarding a definition of volatile, I would refer you to the CUDA programming guide. I have seen summary descriptions referring to this as preventing a register optimization or preventing reordering of loads and stores. I prefer the former and will use that as a working definition.

    The basic idea is that volatile forces any reference (read or write) to that variable to actually go to the memory subsystem. By this I mean it will perform a read or write, and will not attempt to use a value previously loaded into a register. Without this qualifier, the compiler is free to load a value once (for example) from the actual memory location, and then maintain that value (and any updates to it) in a register, for as long as it deems appropriate. Compilers do this with an eye toward performance. (As an aside, note that you used the word "cache" here. I would avoid that usage here. Shared memory has no cache interposed between it and the processor load/store mechanism.)
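
    To make this concrete, here is a minimal illustrative kernel (mine, not the posted code): each thread publishes a value to shared memory, then reads its neighbor's slot. The volatile qualifier guarantees that the store and the subsequent load really touch shared memory; like the posted code, this relies on (pre-Volta) warp-synchronous execution of a single 32-thread warp.

        // Launch with exactly one block of 32 threads (one warp).
        __global__ void neighbor_demo(float *out)
        {
            __shared__ volatile float d[32];
            unsigned int t = threadIdx.x;
            d[t] = (float) t;            // volatile: a real store to shared memory
            float v = d[(t + 1) & 31];   // volatile: a real load of the neighbor's slot
            out[t] = v;                  // expected: (t + 1) % 32
            // Without volatile, the compiler could keep d[t] in a register and
            // defer the store, so the neighbor's load might see stale data.
        }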

    Without volatile in this type of warp-synchronous coding, we will run into a problem if we allow the compiler to "optimize" intermediate values into registers (i.e., keep them there). This comes about primarily because of inter-thread communication. To see why, let's look at the last 2 steps in your final reduction:

        d[tid] += d[tid + 2];
        d[tid] += d[tid + 1];
    

    Let's consider just the threads whose tid values are 0 and 1. In the second-to-last step, thread 0 will pick up the d[2] value and add it to the d[0] value, while thread 1 will pick up the d[3] value and add it to the d[1] value. At this point, if we don't use volatile, the compiler is not obligated to write the d[1] value accumulated by thread 1 back out to shared memory; it is allowed to keep it in a register. So the d[1] value as seen in shared memory is not "up to date".

    Now let's go to the last step. In this step, thread 0 reads the d[1] value from shared memory and adds it to the d[0] value. But without volatile, we saw in the previous step that the shared memory contents of d[1] are no longer accurate. With volatile, on the other hand, the write to shared memory in the previous step actually takes place, and in the final step thread 0 picks up the correct value when it reads d[1]. Remember that a CUDA thread is a standalone entity: one thread cannot directly access values held in registers belonging to another thread. So inter-thread communication at the warp level is normally accomplished either through shared memory or via warp-shuffle operations.

    __syncthreads() has a similar behavior: it forces all register-optimized values like this to be written out to memory, so that they are "visible" to other threads in the block. Therefore, a more sophisticated optimization would be to only switch to a volatile qualified pointer when the reduction switches from the loop-driven __syncthreads() based reduction to the final warp-synchronous reduction. You can see an example in the tutorial slides I linked at the beginning of this answer.
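
    Applied to the posted kernel4, that optimization might look like the following sketch (my restatement, not the slides' exact code); the volatile qualifier is confined to the warp-synchronous tail:

        __global__ void kernel4_opt(T *in, T *out, unsigned int n)
        {
            __shared__ T d[MAX_THREADS];   // no volatile needed in the loop phase

            unsigned int i = bid * bdim + tid;

            n >>= 1;
            d[tid] = (i < n) ? in[i] + in[i+n] : 0;
            __syncthreads ();

            // __syncthreads() already forces values out to shared memory here.
            for (unsigned int s = bdim >> 1; s > warp_size; s >>= 1) {
                if (tid < s)
                    d[tid] += d[tid + s];
                __syncthreads ();
            }

            // Warp-synchronous tail: switch to a volatile view of the same array
            // so that every read and write really touches shared memory.
            if (tid < warp_size) {
                volatile T *vd = d;
                if (n > 64) vd[tid] += vd[tid + 32];
                if (n > 32) vd[tid] += vd[tid + 16];
                vd[tid] += vd[tid + 8];
                vd[tid] += vd[tid + 4];
                vd[tid] += vd[tid + 2];
                vd[tid] += vd[tid + 1];
            }

            if (tid == 0)
                out[bid] = d[0];
        }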

    As another aside, warp-synchronous programming of this kind is (more officially) deprecated in CUDA 9. Instead, you should use cooperative groups.
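
    For reference, here is a hedged sketch of the cooperative-groups style (CUDA 9+; the helper name warp_reduce is mine): the warp becomes an explicit tile, and the tail is done with synchronized shuffles instead of volatile shared memory.

        #include <cooperative_groups.h>
        namespace cg = cooperative_groups;

        // Reduce one warp's values; lane 0 receives the warp's sum.
        __device__ T warp_reduce(cg::thread_block_tile<32> warp, T v)
        {
            for (int offset = warp.size() / 2; offset > 0; offset /= 2)
                v += warp.shfl_down(v, offset);   // explicitly synchronized shuffle
            return v;
        }

        // Inside a kernel:
        //   cg::thread_block_tile<32> warp =
        //       cg::tiled_partition<32>(cg::this_thread_block());
        //   T sum = warp_reduce(warp, my_partial);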

    The reduction sum yields incorrect results without the (n > 64) and (n > 32) conditions.

    These conditionals are primarily used because the code is designed to be "correct" for any block configuration that has a power-of-2 size. If we assume that the block size (number of threads per block) is a power of 2 and greater than 64, it must be 128 or larger, for example. Your n variable starts out as the element count for the launch, which getNumBlocksAndThreads makes roughly twice the block size for the final launches, and it then gets divided by 2:

    n >>= 1;
    

    so from this point on n effectively tracks the block size. Therefore, if we want to ensure the correctness of this line of code:

    d[tid] += d[tid + 32];
    

    then we should only apply that operation when the thread block actually holds at least 64 partial sums, which is what the n > 64 test stands in for:

        if (n > 64) d[tid] += d[tid + 32];
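
    For comparison, the tutorial linked at the top of this answer makes these guards compile-time decisions by passing the block size as a template parameter; a condensed sketch, using the same volatile array d:

        template <unsigned int blockSize>
        __device__ void warpReduce(volatile T *d, unsigned int tid)
        {
            if (blockSize >= 64) d[tid] += d[tid + 32];
            if (blockSize >= 32) d[tid] += d[tid + 16];
            if (blockSize >= 16) d[tid] += d[tid + 8];
            if (blockSize >=  8) d[tid] += d[tid + 4];
            if (blockSize >=  4) d[tid] += d[tid + 2];
            if (blockSize >=  2) d[tid] += d[tid + 1];
        }

        // blockSize is a compile-time constant, so the false branches are
        // eliminated entirely and no runtime test of n is needed.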
    

    Regarding this question: the claim is that the posted code behaves differently depending on whether the if (n > 64) test is included. The reason for this is that the posted code includes a loop which recalculates the thread count and block count as the reduction proceeds:

    int s = blocks;
    while(s > 1) {
        threads = 0;
        blocks = 0;
        getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);
    

    This loop eventually produces launches with a block size smaller than 128, and that is where omitting the if conditions breaks things: once the block has fewer than 64 threads, d[tid + 32] (and, below 32 threads, d[tid + 16]) reads shared-memory locations that no thread ever initialized, and that garbage flows into the final sum. (Simply print out the threads variable during this loop to see this.)
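
    For example, this small host-side trace (a sketch reusing the question's own getNumBlocksAndThreads) shows the launch sequence for the default N = 8 * 1024 * 1024:

        // The first launch uses 256 threads and 8M / 512 = 16384 blocks.
        int s = 16384;
        while (s > 1) {
            int threads = 0, blocks = 0;
            getNumBlocksAndThreads (4, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);
            printf ("s = %5d -> threads = %3d, blocks = %2d\n", s, threads, blocks);
            s = (s + threads * 2 - 1) / (threads * 2);
        }
        // Prints:
        //   s = 16384 -> threads = 256, blocks = 32
        //   s =    32 -> threads =  16, blocks =  1
        // The final pass runs 16-thread blocks, where the unguarded
        // d[tid + 32] and d[tid + 16] read uninitialized shared memory.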

    Regarding this:

    I am having difficulty tracing the problem back because I cannot use print functions like I would on a CPU.

    I'm not sure what the problem is there. printf should work from within kernel code; device-side printf has been supported since compute capability 2.0.
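
    For instance, a single line dropped into kernel4 just before the final store would let you inspect each block's partial sum (illustrative only):

        if (tid == 0)
            printf ("block %u: d[0] = %f, n = %u, bdim = %u\n", bid, d[0], n, bdim);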