Search code examples
c++c++11ppl

Strange execution time in Debug and Release versions


I started to play with the Parallel Patterns Library in VS2010. The application gives me the expected results, but when I benchmark the Debug and Release builds I get a strange execution time in the Release build, as follows. Debug version: "Sequential Duration : 1014", "Parallel Duration : 437". Release version: "Sequential Duration : 31", "Parallel Duration : 484".

This is my application code:

// CPU-bound benchmark kernel: accumulates sqrt(12 * i) + i * i for every
// i in [0, workload).
//
// workload - number of loop iterations; zero or a negative value yields 0.
// Returns the accumulated sum as a double.
double DoWork(int workload)
{
    double result = 0;
    for (int i = 0; i < workload; i++)
    {
        // Square in double, not int: the int product i * i exceeds INT_MAX
        // once i > 46340 (signed overflow is undefined behaviour), and the
        // callers in this file pass workload = 1000000.
        const double x = static_cast<double>(i);
        result += sqrt(x * 4 * 3) + x * x;
    }
    return result;
}

// Single-threaded baseline: calls DoWork(1000000) 100 times, one call after
// another, and returns the 100 results in call order.
vector<double> Seqential()
{
    const int kTaskCount = 100;

    vector<double> out(kTaskCount);
    int task = 0;
    while (task != kTaskCount)
    {
        out[task] = DoWork(1000000);
        ++task;
    }
    return out;
}

// PPL counterpart of Seqential(): hands the same 100 DoWork(1000000) calls to
// parallel_for over the index range [0, 100) with step 1 and returns the
// results by index.
vector<double> Parallel()
{
    const int kTaskCount = 100;
    vector<double> out(kTaskCount);

    // Every index writes only its own slot of `out`, so the iterations do not
    // touch each other's data.
    parallel_for(0, kTaskCount, 1, [&out](int slot)
    {
        out[slot] = DoWork(1000000);
    });

    return out;
}

// Adds up every element of `results` in index order.
//
// results - values to total; may be empty.
// Returns the total, or 0 for an empty vector.
double Sum(const vector<double>& results)
{
    double total = 0;
    // Index with the container's own unsigned size_type: an int counter
    // compared against results.size() mixes signed and unsigned operands.
    for (vector<double>::size_type idx = 0; idx < results.size(); ++idx)
        total += results[idx];
    return total;
}

// Benchmark driver: times Seqential() and then Parallel() with GetTickCount()
// and prints each elapsed tick count together with the sum of the results.
int main()
{
    // Sequential pass.
    DWORD start = GetTickCount();
    vector<double> results = Seqential();
    DWORD duration = GetTickCount() - start;
    cout << "Sequential Duration : " << duration << "  Result : " << Sum(results) << endl;

    // Parallel pass; the label previously read "Prallel Duration".
    start = GetTickCount();
    results = Parallel();
    duration = GetTickCount() - start;
    cout << "Parallel Duration : " << duration << "  Result : " << Sum(results) << endl;

    // Keep the console window open until a key is pressed.
    system("PAUSE");
    return 0;
}

Solution

  • The problem is not in Parallel being slow but in Seqential being too fast:

    • In Seqential, the compiler sees that DoWork will always produce the same result, so the loop calling it 100 times is optimized away and DoWork ends up being called only once.
    • The compiler is not clever enough to optimize the parallel_for in quite the same way, so it ends up doing the actual work (100 times more actual work, in fact).

    If you make DoWork dependent on the loop counter, different calls will now produce different results, so no calls will be redundant and there will be nothing for the compiler to optimize away.

    For example:

    #include <vector>
    #include <iostream>
    #include <math.h>
    #include <ppl.h>
    #include <Windows.h>
    
    using namespace std;
    using namespace Concurrency;
    
    // CPU-bound benchmark kernel: accumulates sqrt(12 * i) + i * i for every
    // i in [0, workload), then adds outer_i so that calls made with different
    // outer_i values return different results.
    //
    // workload - number of loop iterations; zero or a negative value
    //            contributes nothing to the sum.
    // outer_i  - caller's loop counter, added once to the final sum.
    // Returns the accumulated sum as a double.
    double DoWork(int workload, int outer_i)
    {
        double result = 0;
        for (int i = 0; i < workload; i++)
        {
            // Square in double, not int: the int product i * i exceeds
            // INT_MAX once i > 46340 (signed overflow is undefined
            // behaviour). NOTE: sums obtained with the wrapped int product
            // differ from what this version returns.
            const double x = static_cast<double>(i);
            result += sqrt(x * 4 * 3) + x * x;
        }
        result += outer_i;
        return result;
    }
    
    // Single-threaded baseline: calls DoWork(1000000, i) for i = 0..99, one
    // call after another, and returns the 100 results in call order.
    vector<double> Seqential()
    {
        const int kTaskCount = 100;

        vector<double> out(kTaskCount);
        int task = 0;
        while (task != kTaskCount)
        {
            out[task] = DoWork(1000000, task);
            ++task;
        }
        return out;
    }
    
    // PPL counterpart of Seqential(): hands the same 100 DoWork(1000000, i)
    // calls to parallel_for over the index range [0, 100) with step 1 and
    // returns the results by index.
    vector<double> Parallel()
    {
        const int kTaskCount = 100;
        vector<double> out(kTaskCount);

        // Every index writes only its own slot of `out`, so the iterations
        // do not touch each other's data.
        parallel_for(0, kTaskCount, 1, [&out](int slot)
        {
            out[slot] = DoWork(1000000, slot);
        });

        return out;
    }
    
    // Adds up every element of `results` in index order.
    //
    // results - values to total; may be empty.
    // Returns the total, or 0 for an empty vector.
    double Sum(const vector<double>& results)
    {
        double total = 0;
        // Index with the container's own unsigned size_type: an int counter
        // compared against results.size() mixes signed and unsigned operands.
        for (vector<double>::size_type idx = 0; idx < results.size(); ++idx)
            total += results[idx];
        return total;
    }
    
    int main()
    {
    DWORD start = GetTickCount();
    vector<double> results = Seqential();
    DWORD duration = GetTickCount() - start;
    cout<<"Sequential Duration : "<<duration <<"  Result : " <<Sum(results) << endl;
    
    start = GetTickCount();
    results = Parallel();
    duration = GetTickCount() - start;
    cout<<"Prallel Duration : "<<duration <<"  Result : " <<Sum(results) << endl;
    system("PAUSE");
    return 0;
    }
    

    When built by Visual C++ 2010 under Release configuration and run on a quad-core CPU, this prints:

    Sequential Duration : 1607  Result : 1.68692e+015
    Prallel Duration : 374  Result : 1.68692e+015
    

    (BTW, you should really consider formatting your code better.)