I started to play with the Parallel Patterns Library in VS2010. The application gives me the expected results, but when I benchmark the Debug and Release builds I get strange execution times in the Release build, as follows. Debug version: "Sequential Duration : 1014", "Parallel Duration : 437". Release version: "Sequential Duration : 31", "Parallel Duration : 484".
This is my application code:
// Synthetic CPU-bound workload used for benchmarking.
//
// Accumulates sqrt(12 * i) + i^2 for every i in [0, workload) and returns
// the total. A non-positive workload yields 0.
//
// The square is computed in double rather than int: as an int expression
// i * i overflows once i exceeds 46340, which is undefined behaviour for
// signed integers and produced wrapped (wrong) totals for large workloads.
double DoWork(int workload)
{
    double result = 0;
    for (int i = 0; i < workload; i++)
    {
        const double x = static_cast<double>(i);
        result += sqrt(x * 4 * 3) + x * x;
    }
    return result;
}
// Baseline for the benchmark: calls DoWork(1000000) 100 times in a plain
// loop, one call after another, and returns the 100 results in call order.
vector<double> Seqential()
{
    const int kRuns = 100;
    const int kWorkload = 1000000;

    vector<double> output(kRuns);
    int run = 0;
    while (run < kRuns)
    {
        output[run] = DoWork(kWorkload);
        ++run;
    }
    return output;
}
// Parallel counterpart of Seqential: hands the index range [0, 100) with
// step 1 to parallel_for and stores DoWork(1000000) for each index.
// Every lambda invocation writes only the slot for its own index, so the
// invocations do not touch each other's elements.
vector<double> Parallel()
{
    const int kRuns = 100;
    const int kWorkload = 1000000;

    vector<double> output(kRuns);
    parallel_for(0, kRuns, 1, [&output, kWorkload](int slot)
    {
        output[slot] = DoWork(kWorkload);
    });
    return output;
}
// Adds up all elements of `results` from first to last and returns the
// total; an empty vector gives 0.
double Sum(const vector<double>& results)
{
    double total = 0;
    vector<double>::const_iterator it = results.begin();
    const vector<double>::const_iterator last = results.end();
    while (it != last)
    {
        total += *it;
        ++it;
    }
    return total;
}
int main()
{
DWORD start = GetTickCount();
vector<double> results = Seqential();
DWORD duration = GetTickCount() - start;
cout<<"Sequential Duration : "<<duration <<" Result : " <<Sum(results) << endl;
start = GetTickCount();
results = Parallel();
duration = GetTickCount() - start;
cout<<"Prallel Duration : "<<duration <<" Result : " <<Sum(results) << endl;
system("PAUSE");
return 0;
}
The problem is not that Parallel is slow, but that Seqential is too fast:
In Seqential, the compiler sees that DoWork will always produce the same result, so the loop calling it 100 times is optimized away and DoWork ends up being called only once.
The compiler cannot optimize parallel_for in quite the same way, so it ends up doing the actual work (100 times more actual work, in fact).
If you make DoWork dependent on the loop counter, different calls will produce different results, so no call is redundant and there is nothing for the compiler to optimize away.
For example:
#include <vector>
#include <iostream>
#include <math.h>
#include <ppl.h>
#include <Windows.h>
using namespace std;
using namespace Concurrency;
// Synthetic CPU-bound workload used for benchmarking.
//
// Accumulates sqrt(12 * i) + i^2 for every i in [0, workload), then adds
// outer_i to the total. Adding outer_i makes the result depend on the
// caller's loop counter, so repeated calls with different counters are
// not redundant to the optimizer.
//
// The square is computed in double rather than int: as an int expression
// i * i overflows once i exceeds 46340, which is undefined behaviour for
// signed integers and produced wrapped (wrong) totals for large workloads.
double DoWork(int workload, int outer_i)
{
    double result = 0;
    for (int i = 0; i < workload; i++)
    {
        const double x = static_cast<double>(i);
        result += sqrt(x * 4 * 3) + x * x;
    }
    result += outer_i;
    return result;
}
// Baseline for the benchmark: calls DoWork(1000000, run) for run = 0..99
// in a plain loop, one call after another, and returns the 100 results in
// call order. Passing the loop counter makes every call distinct.
vector<double> Seqential()
{
    const int kRuns = 100;
    const int kWorkload = 1000000;

    vector<double> output(kRuns);
    int run = 0;
    while (run < kRuns)
    {
        output[run] = DoWork(kWorkload, run);
        ++run;
    }
    return output;
}
// Parallel counterpart of Seqential: hands the index range [0, 100) with
// step 1 to parallel_for and stores DoWork(1000000, slot) for each index.
// Every lambda invocation writes only the slot for its own index, so the
// invocations do not touch each other's elements.
vector<double> Parallel()
{
    const int kRuns = 100;
    const int kWorkload = 1000000;

    vector<double> output(kRuns);
    parallel_for(0, kRuns, 1, [&output, kWorkload](int slot)
    {
        output[slot] = DoWork(kWorkload, slot);
    });
    return output;
}
// Adds up all elements of `results` from first to last and returns the
// total; an empty vector gives 0.
double Sum(const vector<double>& results)
{
    double total = 0;
    vector<double>::const_iterator it = results.begin();
    const vector<double>::const_iterator last = results.end();
    while (it != last)
    {
        total += *it;
        ++it;
    }
    return total;
}
int main()
{
DWORD start = GetTickCount();
vector<double> results = Seqential();
DWORD duration = GetTickCount() - start;
cout<<"Sequential Duration : "<<duration <<" Result : " <<Sum(results) << endl;
start = GetTickCount();
results = Parallel();
duration = GetTickCount() - start;
cout<<"Prallel Duration : "<<duration <<" Result : " <<Sum(results) << endl;
system("PAUSE");
return 0;
}
When built by Visual C++ 2010 under Release configuration and run on a quad-core CPU, this prints:
Sequential Duration : 1607 Result : 1.68692e+015
Prallel Duration : 374 Result : 1.68692e+015
(BTW, you should really consider formatting your code better.)