Search code examples
ccudaprefix-sum

Prefix sum for arbitrary size using CUDA


I'm trying to implement prefix sum for the arbitrary size of 1-D array in CUDA. The first step I'm trying to figure out is split the array into multiple blocks.

In my first experiment, I have an array of size 8. I specify 2 blocks and 4 threads for each using a parallel algorithm provided in example 39-2 in https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html

My idea is

  1. I split the array to two blocks, each four threads will take care of four elements in each block ("chunk").
  2. Then I expect to obtain two prefix sum results from two blocks respectively.
  3. Then I can add the last element of prefix sum output in the first "chunk" to all the elements in the second "chunk". And get the final answer.

Now I get stuck in the step 2. I cannot get the correct result from the two blocks respectively. Can anyone please give me some corrections? Thank you!

Below is my code:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 8
#define thread_num 4
#define block_num 2


__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++)
{
//        a[i] = (float)(rand() % 1000000) / 1000.0;
    a[i] = i+1;
    printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);

gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++)
{
    printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}

printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}


__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern  __shared__  float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;
int fake_n = block_num * thread_num;


int offset = 1;
if(bid * thread_num + thid<n){ temp[bid * thread_num + thid]  = g_idata[bid * thread_num + thid];
}else{ temp[bid * thread_num + thid]  = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.

for (int d = thread_num>>1; d > 0; d >>= 1)
    // build sum in place up the tree
{
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        temp[bi] += temp[ai];
    }
    offset *= 2;
}

if (thid == 0)
{
    temp[thread_num - 1] = 0;
}

// clear the last element
for (int d = 1; d < thread_num; d *= 2)
    // traverse down tree & build scan
{
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        float t = temp[bid * thread_num + ai];
        temp[ai]  = temp[ bi];
        temp[bi] += t;
    }
}
__syncthreads();

g_odata[bid * thread_num + thid] = temp[bid * thread_num + thid];
}

void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
    f_out[i] = f_out[i-1] + f_in[i-1];

}

Solution

  • The basic error you are making is thinking that you are going to create a shared memory array that is the full size of your array to be scanned.

    Instead, when we break work down into threadblocks, we only create a shared memory array that is big enough for the size of each threadblock. It's quite possible you have some misconceptions about shared memory. Logically, it is a per-threadblock resource. Each threadblock has its own, separate, logical shared memory array. Furthermore, shared memory is limited to the size that can be supported by each SM, or the logical threadblock limit, which is usually 48KB. Therefore, trying to allocate a shared memory array that is the same size as your data array is not going to work anyway as your actual problem size gets "large":

    prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
                                     ^
    

    Instead, the usual approach is to allocate enough for each threadblock, which in this case is determined by the number of threads per block:

    prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
    

    Corresponding to this change, we must change the indexing in the kernel code, so that global memory accesses use full array indexing, but shared memory accesses use a "local" index, determined by the size of the shared memory array.

    This is actually simple to do; we need to remove from your kernel code all the bid*thread_num constructs from each place where a shared memory (temp) index is being used or created. In this way, once we have data stored in shared memory, the behavior of each threadblock is identical, with respect to its own copy of shared memory.

    Here's a worked example:

    $ cat t80.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>
    
    #define N 8
    #define thread_num 4
    #define block_num 2
    
    
    __global__ void prescan(float *g_odata, float *g_idata, int n);
    void scanCPU(float *f_out, float *f_in, int i_n);
    
    double myDiffTime(struct timeval &start, struct timeval &end)
    {
    double d_start, d_end;
    d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
    d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
    return (d_end - d_start);
    }
    
    int main()
    {
    float a[N], c[N], g[N];
    timeval start, end;
    
    float *dev_a, *dev_g;
    int size = N * sizeof(float);
    
    double d_gpuTime, d_cpuTime;
    
    // initialize matrices a
    for (int i = 0; i < N; i++)
    {
    //        a[i] = (float)(rand() % 1000000) / 1000.0;
        a[i] = i+1;
        printf("a[%i] = %f\n", i, a[i]);
    }
    // initialize a and b matrices here
    cudaMalloc((void **) &dev_a, size);
    cudaMalloc((void **) &dev_g, size);
    
    gettimeofday(&start, NULL);
    
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    
    prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
    cudaDeviceSynchronize();
    
    cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);
    
    gettimeofday(&end, NULL);
    d_gpuTime = myDiffTime(start, end);
    
    gettimeofday(&start, NULL);
    scanCPU(c, a, N);
    
    gettimeofday(&end, NULL);
    d_cpuTime = myDiffTime(start, end);
    
    cudaFree(dev_a); cudaFree(dev_g);
    
    for (int i = 0; i < N; i++)
    {
        printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
    }
    
    printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
    printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
    }
    
    
    __global__ void prescan(float *g_odata, float *g_idata, int n)
    {
    extern  __shared__  float temp[];
    // allocated on invocation
    int thid = threadIdx.x;
    int bid = blockIdx.x;
    
    
    int offset = 1;
    if((bid * thread_num + thid)<n){ temp[thid]  = g_idata[bid * thread_num + thid];
    }else{ temp[thid]  = 0;
    } // Make the "empty" spots zeros, so it won't affect the final result.
    
    for (int d = thread_num>>1; d > 0; d >>= 1)
        // build sum in place up the tree
    {
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    
    if (thid == 0)
    {
        temp[thread_num - 1] = 0;
    }
    
    // clear the last element
    for (int d = 1; d < thread_num; d *= 2)
        // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (thid < d)
        {
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            float t = temp[ai];
            temp[ai]  = temp[ bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    
    g_odata[bid * thread_num + thid] = temp[thid];
    }
    
    void scanCPU(float *f_out, float *f_in, int i_n)
    {
    f_out[0] = 0;
    for (int i = 1; i < i_n; i++)
        f_out[i] = f_out[i-1] + f_in[i-1];
    
    }
    $ nvcc -arch=sm_60 -o t80 t80.cu
    $ cuda-memcheck ./t80
    ========= CUDA-MEMCHECK
    a[0] = 1.000000
    a[1] = 2.000000
    a[2] = 3.000000
    a[3] = 4.000000
    a[4] = 5.000000
    a[5] = 6.000000
    a[6] = 7.000000
    a[7] = 8.000000
    c[0] = 0.000, g[0] = 0.000
    c[1] = 1.000, g[1] = 1.000
    c[2] = 3.000, g[2] = 3.000
    c[3] = 6.000, g[3] = 6.000
    c[4] = 10.000, g[4] = 0.000
    c[5] = 15.000, g[5] = 5.000
    c[6] = 21.000, g[6] = 11.000
    c[7] = 28.000, g[7] = 18.000
    GPU Time for scan size 8: 0.004464
    CPU Time for scan size 8: 0.000000
    ========= ERROR SUMMARY: 0 errors
    $
    

    Note that wherever we are accessing the global memory array, we use an index crafted to select a block-size chunk out of the global memory array. But wherever we are indexing into shared memory, we use an index crafted for a single block-sized chunk.