I'm trying to test the speed up of OpenMP on an array sum program.
The elements are generated using random generator to avoid optimization.
The length of array is also set large enough to indicate the performance difference.
This program is built using g++ -fopenmp -g -O0 -o main main.cpp
, -g -O0
are used to avoid optimization.
However OpenMP parallel for code is significant slower than sequential code.
Test result:
Your thread count is: 12
Filling arrays
filling time:66718888
Now running omp code
2thread omp time:11154095
result: 4294903886
Now running omp code
4thread omp time:10832414
result: 4294903886
Now running omp code
6thread omp time:11165054
result: 4294903886
Now running sequential code
sequential time: 3525371
result: 4294903886
#include <iostream>
#include <stdio.h>
#include <omp.h>
#include <ctime>
#include <random>
using namespace std;
long long llsum(char *vec, size_t size, int threadCount) {
long long result = 0;
size_t i;
#pragma omp parallel for num_threads(threadCount) reduction(+: result) schedule(guided)
for (i = 0; i < size; ++i) {
result += vec[i];
}
return result;
}
int main(int argc, char **argv) {
int threadCount = 12;
omp_set_num_threads(threadCount);
cout << "Your thread count is: " << threadCount << endl;
const size_t TEST_SIZE = 8000000000;
char *testArray = new char[TEST_SIZE];
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(0, 4);
cout << "Filling arrays\n";
auto fillingStartTime = clock();
for (int i = 0; i < TEST_SIZE; ++i) {
testArray[i] = dist6(rng);
}
auto fillingEndTime = clock();
auto fillingTime = fillingEndTime - fillingStartTime;
cout << "filling time:" << fillingTime << endl;
// test omp time
for (int i = 1; i <= 3; ++i) {
cout << "Now running omp code\n";
auto ompStartTime = clock();
auto ompResult = llsum(testArray, TEST_SIZE, i * 2);
auto ompEndTime = clock();
auto ompTime = ompEndTime - ompStartTime;
cout << i * 2 << "thread omp time:" << ompTime << endl << "result: " << ompResult << endl;
}
// test sequential addition time
cout << "Now running sequential code\n";
auto seqStartTime = clock();
long long expectedResult = 0;
for (int i = 0; i < TEST_SIZE; ++i) {
expectedResult += testArray[i];
}
auto seqEndTime = clock();
auto seqTime = seqEndTime - seqStartTime;
cout << "sequential time: " << seqTime << endl << "result: " << expectedResult << endl;
delete[]testArray;
return 0;
}
As pointed out by @High Performance Mark, I should use omp_get_wtime()
instead of clock()
.
clock()
is 'active processor time', not 'elapsed time.
See
After using omp_get_wtime()
, and fixing the int i
to size_t i
, the result is more meaningful:
Your thread count is: 12
Filling arrays
filling time:267.038
Now running omp code
2thread omp time:26.1421
result: 15999820788
Now running omp code
4thread omp time:7.16911
result: 15999820788
Now running omp code
6thread omp time:5.66505
result: 15999820788
Now running sequential code
sequential time: 30.4056
result: 15999820788