Search code examples
c++parallel-processingopenmp

How to fast paralleled Code than nomal Code using OpenMP?


#include<opencv2/opencv.hpp>
#include <iostream>
#include <omp.h>
#include "jaehyukTool.h"

using namespace cv;
using namespace std;

void cpu_PeakFinder(Mat img , Mat* peak);
void cpuMp_PeakFinder(Mat img, Mat* peak);

int main()
{
    /*data set 준비*/
    Mat img = imread("testimg.bmp", IMREAD_GRAYSCALE);
    img.convertTo(img, CV_8UC1);
    imgInfo(img);

    /*==========================================================================
    ============================================================================\*/

    /*CPU 실행 코드*/
    /*병렬처리 미사용 코드*/
    {
        Mat peak_cpu(img.rows, img.cols, CV_8UC1, Scalar(0));
        
        system_clock::time_point begin = system_clock::now();
        for (int k = 0; k < 11000; k++)
            cpu_PeakFinder(img, &peak_cpu);

        nanoseconds time = system_clock::now() - begin;
        printf("싱글 스레드 cpu 계산 : %.6lf(ms)\n", time.count() * 1.0E-06);
        transpose(peak_cpu, peak_cpu);
        imshow("peak_cpu", peak_cpu);
    }
    /*==========================================================================
    ============================================================================*/
    {
        /*병렬처리 사용 코드*/
        Mat peak_mp(img.rows, img.cols, CV_8UC1, Scalar(0));
        uchar* ppeakMp = peak_mp.data;

        system_clock::time_point begin = system_clock::now();
        for (int k = 0; k < 11000; k++)
            cpuMp_PeakFinder(img, &peak_mp);

        nanoseconds time = system_clock::now() - begin;
        printf("멀티 스레딩 cpu: %.6lf(ms)\n", time.count() * 1.0E-06);
        transpose(peak_mp, peak_mp);
        imshow("peak_mp", peak_mp);
    }

    waitKey();
    return 0;
}

void cpu_PeakFinder(Mat img, Mat* peak)
{
    uchar* pimg = img.data;
    uchar* ppeak = peak->data;
    int height = img.rows;
    int width = img.cols;
    int offset, x, y, row;
    int max, idx;
    for (y = 0; y < height; ++y)
    {
        max = pimg[width * y + 0];
        idx = 0;
        for (x = 0; x < width; ++x)
        {
            offset = width * y + x;
            if (max < pimg[offset])
            {
                max = pimg[offset];
                idx = offset;
            }
        }
        ppeak[idx] = 255;
    }
}

void cpuMp_PeakFinder(Mat img, Mat* peak)
{
    uchar* pimg = img.data;
    uchar* ppeakMp = peak->data;
    const int width = img.cols;
    const int height = img.rows;
    int offset, x, y;
    int max, idx;

#pragma omp parallel  private(offset) private(max) private(idx) private(x) private(y) num_threads(height)
    {
        y = omp_get_thread_num();
        max = pimg[width * y + 0];
        idx = 0;

        for (x = 0; x < width; ++x)
        {
            offset = width * y + x;
            if (max < pimg[offset])
            {
                max = pimg[offset];
                idx = offset;
            }
        }
        ppeakMp[idx] = 255;
    }
}

I want cpuMp_PeakFinder() to be faster than cpu_PeakFinder(). But that has not worked as I hoped.

Why is the parallelized code slower?

  • I used OpenMp for code parallelization.
  • The input is an image of size 64x5120.
  • The functions (cpuMp_PeakFinder(), cpu_PeakFinder()) finds the maximum value of each row.

Solution

  • setting the number of threads num_threads manually is usually wrong in computational code, as setting them lower or higher than your actual number of cores will penalize your performance, as threads will be contesting on the available cores or underutilizing them.

    instead you should rely on #pragma omp parallel for to distribute work to the correct number of threads (the total number of threads of your processor).

    #pragma omp parallel for schedule(dynamic)
    for (int y = 0; y < height; y++)
    {
    ...
    }
    

    the parallel version is now about twice as fast as the single-thread version.

    you may want to play with the blocks in schedule(dynamic, blocks) to get the best number of chunks per core, but because the work is very small , twice as fast is the best you can get.


    MWE with const correctness

    #include <opencv2/opencv.hpp>
    #include <iostream>
    #include <omp.h>
    
    void cpu_PeakFinder(const cv::Mat& img, cv::Mat& peak);
    void cpuMp_PeakFinder(const cv::Mat& img, cv::Mat& peak);
    
    int main()
    {
        cv::Mat img(64, 5120, CV_8UC1);
        /*==========================================================================
        ============================================================================\*/
    
        {
            cv::Mat peak_cpu(img.rows, img.cols, CV_8UC1, cv::Scalar(0));
    
            std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
            for (int k = 0; k < 11000; k++)
                cpu_PeakFinder(img, peak_cpu);
    
            std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
            std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
            // cv::transpose(peak_cpu, peak_cpu);
            // imshow("peak_cpu", peak_cpu);
        }
        /*==========================================================================
        ============================================================================*/
        {
            cv::Mat peak_mp(img.rows, img.cols, CV_8UC1, cv::Scalar(0));
            uchar *ppeakMp = peak_mp.data;
    
            std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
            for (int k = 0; k < 11000; k++)
                cpuMp_PeakFinder(img, peak_mp);
    
            std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
            std::cout << "Time difference parallel = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
            // transpose(peak_mp, peak_mp);
            // imshow("peak_mp", peak_mp);
        }
    
        // cv::waitKey();
        return 0;
    }
    
    void cpu_PeakFinder(const cv::Mat& img, cv::Mat& peak)
    {
        uchar *pimg = img.data;
        uchar *ppeak = peak.data;
        int height = img.rows;
        int width = img.cols;
        int offset, x, y, row;
        int max, idx;
        for (y = 0; y < height; ++y)
        {
            max = pimg[width * y + 0];
            idx = 0;
            for (x = 0; x < width; ++x)
            {
                offset = width * y + x;
                if (max < pimg[offset])
                {
                    max = pimg[offset];
                    idx = offset;
                }
            }
            ppeak[idx] = 255;
        }
    }
    
    void cpuMp_PeakFinder(const cv::Mat& img, cv::Mat& peak)
    {
        uchar *pimg = img.data;
        uchar *ppeakMp = peak.data;
        const int width = img.cols;
        const int height = img.rows;
        int offset, x, y;
        int max, idx;
    
        #pragma omp parallel for schedule(dynamic)
        for (int y = 0; y < height; y++)
        {
            max = pimg[width * y + 0];
            idx = 0;
    
            for (x = 0; x < width; ++x)
            {
                offset = width * y + x;
                if (max < pimg[offset])
                {
                    max = pimg[offset];
                    idx = offset;
                }
            }
            ppeakMp[idx] = 255;
        }
    }