Search code examples
androidopenclviola-jonesbank-conflict

mobile OpenCL local memory bank conflict. Why using local memory is slower than does global memory in kernel?


I'm developing face detection app in android platform using OpenCL. Face detection algorithm is based on Viola Jones algorithm. I tried to make Cascade classification step kernel code. and I set classifier data of cascade stage 1 among cascade stages to local memory(__local) because classifier data are used for all work-items.

But, kernel profiling time without using local mem(using global mem) is more faster than that does with using local memory.

edited :

I uploaded full code.


with local memory version

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
       int cascadeLocalSize = get_local_size(0);

       __local float localS1F1[42];

       int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
       if(localIdx<42)
       {
           int stage1Idx = localIdx + idxNumValStageArray[0]+4;
           localS1F1[localIdx] = classifierMem[stage1Idx];
       }
       barrier(CLK_LOCAL_MEM_FENCE);


       float resizeFactor = 1.0;
       int2 im_dim = get_image_dim(input_image);
       unsigned int srcWidth = im_dim.x*(float)resizeFactor;
       unsigned int srcHeight = im_dim.y*(float)resizeFactor;

       int gx = get_global_id(0);
       int gy = get_global_id(1);

       int skinX=0;
       int skinY=0;
       int coordi=vecSkin[512*gy+gx];
       skinX = coordi%im_dim.x;
       skinY = coordi/im_dim.x;

       if( skinX >= 10 && skinY >= 10 )
       {
             skinX -= 10;
             skinY -= 10;
       }      

       int type = gx%3;

       unsigned int windowWidth = classifierMem[0];
       unsigned int windowHeight = classifierMem[1]; 


       unsigned int stageIndex;
       float stageThres;
       float numFeatures;
       unsigned int featureIndex;
       float featureValue;

       if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
             bool stagePass = true;
             unsigned int index = 0;
             for(unsigned int i=numTotStage; i>0;i--){
                    if(stagePass){
                           if(index == 0){
                                 stageIndex = idxNumValStageArray[0];                                 
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = 0;
                                 featureValue = 0.0;                           
                           }
                           else{
                                 stageIndex = idxNumValStageArray[index];
                                 stageThres = classifierMem[stageIndex+2];
                                 numFeatures = classifierMem[stageIndex+3];
                                 featureIndex = stageIndex+4;
                                 featureValue = 0.0;
                           }
                           float featureThres;
                           float succVal;
                           float failVal;
                           unsigned int numRegions;
                           float regionValue;


                           if(type ==0 && index==0)
                           {
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass) 
                                 }// end of for(unsigned int j=numFeatures; j>0;j--)

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));

                           }

                           else if(type ==1 && index ==0)
                           {
                                 featureIndex +=14;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                              if(j==1)
                                                     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;


                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }

                                  index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           else if(index == 0)
                           {
                                 featureIndex +=28;
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){

                                              if(j==2)     featureIndex -= 42;

                                               featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=localS1F1[featureIndex++];
                                              failVal=localS1F1[featureIndex++];
                                              numRegions = localS1F1[featureIndex++];
                                              regionValue =0.0;

                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){

                                                     regionP.x=(int)(localS1F1[featureIndex])+skinX;
                                                     regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
                                                     regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;

                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];

                                                     featureIndex+=5;
                                              }// end of for(unsigned int k=numRegions; k>0;k--)
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;

                                        }// end of if(stagePass)
                                 }//end of for(unsigned int j=numFeatures; j>0;j--)

                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }

                           //stage 
                           else{
                                 for(unsigned int j=numFeatures; j>0;j--){
                                        if(stagePass){
                                               featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                                              succVal=classifierMem[featureIndex++];
                                              failVal=classifierMem[featureIndex++];
                                              numRegions = classifierMem[featureIndex++];
                                              regionValue =0.0;
                                              float4 rectValue;
                                              int4 regionP;                                  
                                              for(unsigned int k=numRegions; k>0;k--){
                                                     regionP.x=(int)(classifierMem[featureIndex])+skinX;
                                                     regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                                                     regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                                                     regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
                                                     rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                                                     rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                                                     rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                                                     rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
                                                     regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4]; 
                                                     featureIndex+=5;
                                              }
                                              featureValue += (regionValue < featureThres)?failVal:succVal;                              
                                              if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                                        }
                                 }
                                 index++;
                                 if(featureValue < stageThres)    stagePass =false;
                                 else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
                           }
                    }
             }      
       }else return;
}

original version(without local mem)

__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
    float resizeFactor = 1.0;

    int2 im_dim = get_image_dim(input_image);

    unsigned int srcWidth = im_dim.x*(float)resizeFactor;
    unsigned int srcHeight = im_dim.y*(float)resizeFactor;

    int gx = get_global_id(0);
    int gy = get_global_id(1);


    int skinX=0;
    int skinY=0;
    int coordi=vecSkin[512*gy+gx];
    skinX = coordi%im_dim.x;
    skinY = coordi/im_dim.x;

        if( skinX >= 10 && skinY >= 10 )
    {
        skinX -= 10;
        skinY -= 10;
    }   

    unsigned int windowWidth = classifierMem[0];
    unsigned int windowHeight = classifierMem[1];   

    if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
        bool stagePass = true;
        unsigned int index = 0;
        for(unsigned int i=numTotStage; i>0;i--){
            if(stagePass){
                unsigned int stageIndex = idxNumValStageArray[index++];
                float stageThres = classifierMem[stageIndex+2];
                float numFeatures = classifierMem[stageIndex+3];
                unsigned int featureIndex = stageIndex+4;
                float featureValue = 0.0;               

                for(unsigned int j=numFeatures; j>0;j--){
                    if(stagePass){
                        float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
                        float succVal=classifierMem[featureIndex++];
                        float failVal=classifierMem[featureIndex++];
                        unsigned int numRegions = classifierMem[featureIndex++];
                        float regionValue =0.0;

                        for(unsigned int k=numRegions; k>0;k--){                    
                            float4 rectValue;
                            int4 regionP;

                            regionP.x=(int)(classifierMem[featureIndex])+skinX;
                            regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
                            regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
                            regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;

                            rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
                            rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
                            rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
                            rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;

                            regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];

                            featureIndex+=5;
                        }

                        featureValue += (regionValue < featureThres)?failVal:succVal;                   

                        if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
                    }
                }
                if(featureValue < stageThres)   stagePass =false;
                else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
            }
        }   
    }else return;
}

profiling time : original version(without local mem) : 24ms modified version(with local mem) : 28ms

edited : actually localWorkSize NULL becasue globalWorkSize always vary by vector size which put the NDRangeKernel. When Put the specific localWorkSize, face detection rate fall... So i tried to put the localWorkSize NUll, then face detection rate good. So I want to the reason.

this is host code :

    //localWorkSize[0] = 16;
    //localWorkSize[1] = 12; 
    numThreadsX=512;
    globalWorkSize[0] = numThreadsX;
    globalWorkSize[1] =  vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
    errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL); 
    errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL); 
    errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
    errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
    errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
    errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
    errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);

    errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);

Solution

  • There's a variety of reasons:

    • Thread divergence of the memory load
    • Barriers aren't free
    • More instructions
    • On most platforms constant memory is pretty much the same as shared memory, so the extra work has no gains.
    • We don't know how many reads the shared memory version is saving - please tell us the size of the work group. The smaller it is, the less reads from non-local memory we are saving. Not that it matters if this is constant memory.

    Profile both of them and see if the profiler's outputs shows anything at max utilization of anything and indicate to us what that is.

    Also, I'm not convinced your localIdx and stage1Idx make sense, they could be going out of array bounds and causing odd behaviors. At the very least for a given gx/gy you look like you're using different indices from classifierMem.