I'm developing face detection app in android platform using OpenCL
. Face detection algorithm is based on Viola Jones algorithm. I tried to make Cascade classification step kernel code. and I set classifier data
of cascade stage 1 among cascade stages to local memory(__local)
because classifier data are used for all work-items.
But, kernel profiling time without using local mem(using global mem) is more faster than that does with using local memory.
edited :
I uploaded full code.
with local memory version
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
int cascadeLocalSize = get_local_size(0);
__local float localS1F1[42];
int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
if(localIdx<42)
{
int stage1Idx = localIdx + idxNumValStageArray[0]+4;
localS1F1[localIdx] = classifierMem[stage1Idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
int type = gx%3;
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
unsigned int stageIndex;
float stageThres;
float numFeatures;
unsigned int featureIndex;
float featureValue;
if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
if(index == 0){
stageIndex = idxNumValStageArray[0];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = 0;
featureValue = 0.0;
}
else{
stageIndex = idxNumValStageArray[index];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = stageIndex+4;
featureValue = 0.0;
}
float featureThres;
float succVal;
float failVal;
unsigned int numRegions;
float regionValue;
if(type ==0 && index==0)
{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}// end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(type ==1 && index ==0)
{
featureIndex +=14;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==1)
featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(index == 0)
{
featureIndex +=28;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==2) featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}// end of for(unsigned int k=numRegions; k>0;k--)
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}//end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
//stage
else{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
succVal=classifierMem[featureIndex++];
failVal=classifierMem[featureIndex++];
numRegions = classifierMem[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}
}else return;
}
original version(without local mem)
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
unsigned int stageIndex = idxNumValStageArray[index++];
float stageThres = classifierMem[stageIndex+2];
float numFeatures = classifierMem[stageIndex+3];
unsigned int featureIndex = stageIndex+4;
float featureValue = 0.0;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
float succVal=classifierMem[featureIndex++];
float failVal=classifierMem[featureIndex++];
unsigned int numRegions = classifierMem[featureIndex++];
float regionValue =0.0;
for(unsigned int k=numRegions; k>0;k--){
float4 rectValue;
int4 regionP;
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}else return;
}
profiling time : original version(without local mem) : 24ms modified version(with local mem) : 28ms
edited : actually localWorkSize NULL becasue globalWorkSize always vary by vector size which put the NDRangeKernel. When Put the specific localWorkSize, face detection rate fall... So i tried to put the localWorkSize NUll, then face detection rate good. So I want to the reason.
this is host code :
//localWorkSize[0] = 16;
//localWorkSize[1] = 12;
numThreadsX=512;
globalWorkSize[0] = numThreadsX;
globalWorkSize[1] = vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL);
errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL);
errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);
errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);
There's a variety of reasons:
Profile both of them and see if the profiler's outputs shows anything at max utilization of anything and indicate to us what that is.
Also, I'm not convinced your localIdx and stage1Idx make sense, they could be going out of array bounds and causing odd behaviors. At the very least for a given gx/gy you look like you're using different indices from classifierMem.