I am trying to write a multi-threaded program to produce a vector of N*NumPerThread
uniform random integers, where N
is the return value of std::thread::hardware_concurrency()
and NumPerThread
is the amount of random numbers I want each thread to generate.
I created a multi-threaded version:
#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>
using Clock = std::chrono::high_resolution_clock;
namespace Vars
{
const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
std::vector<int> RandNums(NumPerThread*N);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(1, 1000);
int sz = 0;
}
using namespace Vars;
void AddN(int start)
{
static std::mutex mtx;
std::lock_guard<std::mutex> lock(mtx);
for (unsigned int i=start; i<start+NumPerThread; i++)
{
RandNums[i] = dis(gen);
++sz;
}
}
int main()
{
auto start_time = Clock::now();
std::vector<std::thread> threads;
threads.reserve(N);
for (unsigned int i=0; i<N; i++)
{
threads.emplace_back(std::move(std::thread(AddN, i*NumPerThread)));
}
for (auto &i: threads)
{
i.join();
}
auto end_time = Clock::now();
std::cout << "\nTime difference = "
<< std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
std::cout << "size = " << sz << '\n';
}
and a single-threaded version
#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>
using Clock = std::chrono::high_resolution_clock;
namespace Vars
{
const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
std::vector<int> RandNums(NumPerThread*N);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(1, 1000);
int sz = 0;
}
using namespace Vars;
void AddN()
{
for (unsigned int i=0; i<NumPerThread*N; i++)
{
RandNums[i] = dis(gen);
++sz;
}
}
int main()
{
auto start_time = Clock::now();
AddN();
auto end_time = Clock::now();
std::cout << "\nTime difference = "
<< std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
std::cout << "size = " << sz << '\n';
}
The execution times are more or less the same. I am assuming there is a problem with the multi-threaded version?
P.S. I looked at all of the other similar questions here, I don't see how they directly apply to this task...
You can do with without any mutex.
Be sure each thread has its own copy of
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(1, 1000);
That should run much faster.
UNTESTED code - but this should make my above suggestion more clear:
using Clock = std::chrono::high_resolution_clock;
namespace SharedVars
{
const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
std::vector<int> RandNums(NumPerThread*N);
std::mutex mtx;
}
void PerThread_AddN(int threadNumber)
{
using namespace SharedVars;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(1, 1000);
int sz = 0;
vector<int>::iterator from;
vector<int>::iterator to;
{
std::lock_guard<std::mutex> lock(mtx); // hold the lock only while accessing shared vector, not while accessing its contents
from = RandNums.begin () + threadNumber*NumPerThread;
to = from + NumPerThread;
}
for (auto i = from; i < to; ++i)
{
*i = dis(gen);
}
}
int main()
{
auto start_time = Clock::now();
std::vector<std::thread> threads;
threads.reserve(N);
for (unsigned int i=0; i<N; i++)
{
threads.emplace_back(std::move(std::thread(PerThread_AddN, i)));
}
for (auto &i: threads)
{
i.join();
}
auto end_time = Clock::now();
std::cout << "\nTime difference = "
<< std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
std::cout << "size = " << sz << '\n';
}