Below is a sample of working code (a parallel_for using the Parallel Patterns Library, PPL). The main problem is that the values stored in sqr (a concurrent_vector) change on every execution, but they should not!
I used concurrent_vector for random access; why is it not working?
#include <iostream>
#include <ppl.h>
#include <concurrent_vector.h>
using namespace std;
using namespace concurrency;
// Grid dimensions: `a` is the number of columns (x range) and `b` the number
// of rows (y range); main() addresses cells row-major as a * y + x.
const int a = 10, b = 30;
// Lock available for serializing access to state shared between the
// parallel_for workers in main().
critical_section cs;
int main() {
concurrent_vector< int > labels( a * b );
concurrent_vector< int > sqr( 5 );
// filling label vector
for ( int y = 0; y < b; y++ ) {
for ( int x = 0; x < a; x++ ) {
if( x<2 && y>3 )
labels[ a * y + x ] = 1;
else if( x<30 && y<5 )
labels[ a * y + x ] = 2;
else if( x>5 && y>10 )
labels[ a * y + x ] = 3;
else if( x>2 && y>20 )
labels[ a * y + x ] = 4;
}
}
// printing
for ( int y = 0; y < b; y++ ) {
for ( int x = 0; x < a; x++ ) {
cout << labels[ a * y + x ] << ", ";
}
cout << endl;
}
parallel_for ( 0, b, [ & ]( int y ) {
for ( int x = 0; x < a; x++ ) {
//cs.lock(); // when i used it's working but slow
int i = labels[ a * y + x ];
//cs.unlock();
if ( i < 0 ) continue;
sqr[ i ] ++;
}
} );
for( int i=0; i<5; i++ )
cout << sqr[i] << ", ";
cout << "" << endl;
system ("pause");
return 0;
}
Using the task_group::wait method should be faster (as you don't have to lock/unlock every time), and it may work as you expect.
This method blocks the current task until the tasks of another task group have completed their work.
See MSDN: Parallel Tasks.
Update: I have run some timing tests, and it seems that this is not a solution (besides, both fail on large data inputs on my dual-core machine). This may be a "by design" bug in concurrent_vector, as in Intel's TBB - see "tbb::concurrent_vector returns wrong size".