I have the following parallel kernel reduction written on OpenCL. I just want to sum all the values from the BlockSum
array. While using the work_group_reduce_add(BlockSum[GetIndex]);
it works perfectly right, using the optimized code I read from https://www.fz-juelich.de/SharedDocs/Downloads/IAS/JSC/EN/slides/opencl/opencl-05-reduction.pdf?__blob=publicationFile (Slide 11) does not work correctly. What seems to be the error here? The global_work_size is set to {16,16} as well as the local_work_size (meaning 256 threads in total for each workgroup). In the case of the work_group_reduce_add
I get 255 which is correct but with the optimized code I get 0
__kernel void Reduction()
{
unsigned char GetThreadX = get_local_id(0); //it takes values from 0..15
unsigned char GetThreadY = get_local_id(1); //it takes values from 0..15
unsigned char GetGroup = get_local_size(0); //16
unsigned short BlockSum[256];
int SumOfAll= 0;
unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY); // takes values 0..255, group=16
BlockSum[GetIndex] = 1;
barrier(CLK_LOCAL_MEM_FENCE);
SumOfAll= work_group_reduce_add(BlockSum[GetIndex]); //works great
// BUT CODE BELOW DOES NOT SUM CORRECTLY
/*
for(unsigned short stride=128; stride>1; stride >>= 1) {
if(GetIndex < stride)
BlockSum[GetIndex] += BlockSum[GetIndex + stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(GetIndex==0)
SumOfAll = BlockSum[0] + BlockSum[1];
barrier(CLK_LOCAL_MEM_FENCE);
*/
printf("SumOfAll=%d\n",SumOfAll);
}
Ok problem fixed. The BlockSum[256];
was not declared as __local
but as private memory (silently without the __local
Address Space Qualifier) which means that every thread (or core) had its own copy of these data, but the optimized reduction code was looking for shared local memory data among threads, to sum up the values. Also the variable int SumOfAll;
should also be declared as __local
with initialization or private
in my case without any initialization before. You choose.
So the working kernel is now looking like this.
I hope this type of error will help someone that is not cautious like myself.
__kernel void Reduction()
{
unsigned char GetThreadX = get_local_id(0); //it takes values from 0..15
unsigned char GetThreadY = get_local_id(1); //it takes values from 0..15
unsigned char GetGroup = get_local_size(0); //16
//*********************************************************
//below was the offending code and the root of the problem
//**********************************************************
__local unsigned short BlockSum[256];
int SumOfAll;
//**********************************************************
unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY); // takes values 0..255, group=16
BlockSum[GetIndex] = 1;
barrier(CLK_LOCAL_MEM_FENCE);
//SumOfAll = work_group_reduce_add(BlockSum[GetIndex]);
// OPTIMIZED CODE BELOW NOW SUM UP CORRECTLY
for(unsigned short stride=128; stride>1; stride >>= 1) {
if(GetIndex < stride)
BlockSum[GetIndex] += BlockSum[GetIndex + stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(GetIndex==0)
SumOfAll = BlockSum[0] + BlockSum[1];
barrier(CLK_LOCAL_MEM_FENCE);
printf("SumOfAll=%d\n",SumOfAll);
}