c++multithreading opencv compilation openmp

Why does cv::parallel_for_ run faster than my own implementation?

I am implementing the nearest-neighbor resizing algorithm for RGB images (unsigned char type). While comparing its speed with OpenCV's on an Android ARMv8 platform, I found that OpenCV uses cv::parallel_for_ for multi-threaded speed-up.

So I dug into the corresponding source code of OpenCV's cv::resize(), copied the code that actually runs, and pasted it into my main.cpp. It consists of a functor, resizeNNInvoker, and a call to cv::parallel_for_ that runs this functor across multiple threads.

What confuses me is that the cv::parallel_for_ version runs faster than the version using my_parallel_for_, even though its code is the same as OpenCV's.

To make it more clear:

Tested on Android armv8 platform
Compiled OpenCV with OpenMP multithreading, with the other parallel frameworks turned off
Modified the source code of OpenCV's cv::parallel_for_ to be the same as my_parallel_for_ (see below)
Used 4 threads via cv::setNumThreads(4), bound to the 4 big CPU cores (using the ncnn API)
All code was compiled in Release mode (via CMake)
Test input image: width=7680,height=4320, target image size: 7680/3, 4320/3.

The time costs are as follows:

method	time cost
cv::parallel_for_	3.24 ms
my_parallel_for_	7.67 ms
inplace openmp	7.75 ms

// Hand-written stand-in for cv::parallel_for_ (modelled on the OpenCV source):
// every index in [range.start, range.end) is handed to `body` as its own
// one-element cv::Range, with iterations distributed dynamically over 4 OpenMP threads.
void my_parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body)
{
    const int first = range.start;
    const int last = range.end;

    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int idx = first; idx < last; ++idx)
    {
        // One stripe per iteration: [idx, idx + 1)
        const cv::Range stripe(idx, idx + 1);
        body(stripe);
    }
}

// Row-range functor that performs nearest-neighbor resizing (copied from the OpenCV source).
//
// Each call to operator() fills the destination rows [range.start, range.end).
// For a destination row y the source row is cvFloor(y*ify), clamped to the last
// source row; for a destination column x the source pixel starts x_ofs[x] bytes
// into that source row.
//
// The functor stores references to src/dst and a raw pointer to x_ofs, so all
// three must stay alive for as long as the functor is used.
class resizeNNInvoker : public cv::ParallelLoopBody
{
public:
    // _src   : source image (only read)
    // _dst   : destination image; its size defines how many rows/columns are written
    // _x_ofs : per-destination-column byte offsets into a source row; read for
    //          every x in [0, dst width)
    // _ify   : factor multiplied with a destination row index to obtain the source row
    resizeNNInvoker(const cv::Mat& _src, cv::Mat &_dst, int *_x_ofs, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs),
        ify(_ify)
    {
    }

    // Resizes destination rows [range.start, range.end).
    // The method is const, but it writes pixel data through dst.data.
    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
    {
        //printf("--- resizeNNInvoker get called\n");
        cv::Size ssize = src.size(), dsize = dst.size();
        // pix_size: element size of src as reported by elemSize(); selects the copy routine below
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            // D: start of destination row y; S: start of the matching source row
            uchar* D = dst.data + dst.step*y;
            // Clamp so the computed source row never exceeds the last row of src
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            // Specialized copy loops for common pixel sizes. Cases 1, 2 and 4 index
            // from the row start; the remaining cases advance D by one pixel per iteration.
            switch( pix_size )
            {
            case 1:
                // 1-byte pixels: handle two destination pixels per iteration...
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                // ...then the leftover pixel when the width is odd
                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                // 2-byte pixels: copy each pixel as one ushort.
                // NOTE(review): the cast drops const from S and presumably relies on
                // the target tolerating this typed access at arbitrary offsets - confirm.
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                // 3-byte pixels: copy the three bytes individually
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                // 4-byte pixels: copy each pixel as one int (same cast caveat as case 2)
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                // 6-byte pixels: copy as three ushort values
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                // 8-byte pixels: copy as two int values
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                // 12-byte pixels: copy as three int values
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                // Any other pixel size: generic byte-by-byte copy
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const uchar* _tS = S + x_ofs[x];
                    for (int k = 0; k < pix_size; k++)
                        D[k] = _tS[k];
                }
            }
        }
    }

private:
    const cv::Mat& src;   // source image, referenced (not owned)
    cv::Mat& dst;         // destination image, referenced (not owned)
    int* x_ofs;           // byte offset into a source row for each destination column (not owned)
    double ify;           // destination-row -> source-row factor

    // Copy construction and copy assignment are declared private (no definition
    // appears in this excerpt), which prevents outside code from copying the functor.
    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

// Entry point: nearest-neighbor resize of an 8-bit, 3-channel image, parallelized over
// destination rows with OpenMP.
//
// Parameters:
//   src_buf, dst_buf            caller-owned pixel buffers; they are wrapped in cv::Mat
//                               headers (no copy), so dst_buf is written in place
//   src_height, src_width       source size in pixels
//   dst_height, dst_width       destination size in pixels
//   src_linebytes, dst_linebytes
//                               bytes per row of the respective buffer. They are used as
//                               the cv::Mat row step so that buffers with padded rows are
//                               addressed correctly. A value that cannot be a valid step
//                               (<= 0 or smaller than width*3) falls back to tightly
//                               packed rows, which is what earlier versions always assumed.
//   opt                         not used by this function
//
// Does nothing when any dimension is non-positive.
void resize_nearest(const uchar* src_buf, int src_height, int src_width, int src_linebytes, uchar* dst_buf, int dst_height, int dst_width, int dst_linebytes, const Option& opt)
{
    (void)opt;

    // Degenerate sizes: nothing to write, and the scale computation below would divide by zero.
    if (src_height <= 0 || src_width <= 0 || dst_height <= 0 || dst_width <= 0)
        return;

    // CV_8UC3: three 1-byte channels per pixel.
    const size_t kPixelBytes = 3;

    // Row step for each buffer: honor the caller-supplied line size when it is usable,
    // otherwise assume tightly packed rows.
    const size_t src_min_step = static_cast<size_t>(src_width) * kPixelBytes;
    const size_t dst_min_step = static_cast<size_t>(dst_width) * kPixelBytes;
    const size_t src_step = (src_linebytes > 0 && static_cast<size_t>(src_linebytes) >= src_min_step)
                                ? static_cast<size_t>(src_linebytes) : src_min_step;
    const size_t dst_step = (dst_linebytes > 0 && static_cast<size_t>(dst_linebytes) >= dst_min_step)
                                ? static_cast<size_t>(dst_linebytes) : dst_min_step;

    // cv::Mat needs a non-const pointer; src is only ever read through a const reference below.
    cv::Mat src(cv::Size(src_width, src_height), CV_8UC3, const_cast<uchar*>(src_buf), src_step);
    cv::Mat dst(cv::Size(dst_width, dst_height), CV_8UC3, dst_buf, dst_step);

    cv::Size ssize = src.size(), dsize = dst.size();

    double inv_scale_x = (double)dsize.width/ssize.width;
    double inv_scale_y = (double)dsize.height/ssize.height;
    double ifx = 1./inv_scale_x, ify = 1./inv_scale_y;

    // Precompute, for every destination column, the byte offset of its source pixel
    // within a source row (clamped to the last source column).
    cv::AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs.data();
    int pix_size = (int)src.elemSize();

    for( int x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    cv::Range range(0, dsize.height);

    // !! define the instance of resizeNNInvoker functor.
    resizeNNInvoker invoker(src, dst, x_ofs, ify);

    // Three interchangeable ways of running the functor over all destination rows;
    // flip the preprocessor conditions to compare them.
#if 0
    cv::parallel_for_(range, invoker);   //!! use opencv's, cost 3.24 ms
#elif 0
    my_parallel_for_(range, invoker);    //!! use own implementation, cost 7.67 ms
#else
    set_omp_dynamic(1);    //!! use inplace-implementation, cost 7.75 ms
    cv::Range stripeRange = range;
    #pragma omp parallel for schedule(dynamic) num_threads(4)
    for (int i = stripeRange.start; i < stripeRange.end; ++i)
        invoker(cv::Range(i, i + 1));
#endif
}

Solution

I finally figured it out. My OpenMP configuration in CMakeLists.txt caused the performance mismatch.

To be concrete, I build the executable test, which depends on a static library libplain.a, and libplain.a is compiled and linked against OpenMP with the following (the previous, wrong configuration):

# OpenMP setup for the static library target `plain` (the original, problematic version).
# The compile options below are attached with PRIVATE visibility, so they apply to
# plain's own sources only and are not passed on to targets that link against plain.
find_package(OpenMP)
# No imported OpenMP::OpenMP_CXX target available: add the raw OpenMP compile flags by hand.
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PRIVATE ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        # Android NDK newer than r20: pass -fopenmp explicitly and link OpenMP statically.
        # Note the mismatch: the link flags are PUBLIC but the compile flag is PRIVATE.
        target_compile_options(plain PRIVATE -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        # Imported target found by find_package(OpenMP).
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        # Fallback: hand the raw OpenMP flags to the linker, visible to plain only.
        target_link_libraries(plain PRIVATE "${OpenMP_CXX_FLAGS}")
    endif()
endif()

After changing every PRIVATE visibility to PUBLIC, the OpenMP compile and link flags propagate correctly to the executable target test:

# Corrected setup for the static library target `plain`: every OpenMP-related option
# is PUBLIC, so it is applied to plain and also to targets that link against plain.
target_link_libraries(plain PUBLIC ${OpenCV_LIBS})

find_package(OpenMP)
# No imported OpenMP::OpenMP_CXX target available: add the raw OpenMP compile flags by hand.
if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
    target_compile_options(plain PUBLIC ${OpenMP_CXX_FLAGS})
endif()

if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
        # Android NDK newer than r20: pass -fopenmp explicitly and link OpenMP statically.
        target_compile_options(plain PUBLIC -fopenmp)
        target_link_libraries(plain PUBLIC -fopenmp -static-openmp)
    elseif(OpenMP_CXX_FOUND)
        # Imported target found by find_package(OpenMP).
        target_link_libraries(plain PUBLIC OpenMP::OpenMP_CXX)
    else()
        # Fallback: hand the raw OpenMP flags to the linker.
        target_link_libraries(plain PUBLIC "${OpenMP_CXX_FLAGS}")
    endif()
endif()

With this updated CMake configuration and a rebuilt program, my_parallel_for_ runs at nearly the same speed as cv::parallel_for_.