C++ multithreaded version of creating vector of random numbers slower than single-threaded version

I am trying to write a multi-threaded program to produce a vector of N*NumPerThread uniform random integers, where N is the return value of std::thread::hardware_concurrency() and NumPerThread is the amount of random numbers I want each thread to generate.

I created a multi-threaded version:

#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>

using Clock = std::chrono::high_resolution_clock;

namespace Vars
{
    const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
    const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
    std::vector<int> RandNums(NumPerThread*N);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, 1000);
    int sz = 0;
}

using namespace Vars;

void AddN(int start)
{
    static std::mutex mtx;
    std::lock_guard<std::mutex> lock(mtx);
    for (unsigned int i=start; i<start+NumPerThread; i++)
    {
        RandNums[i] = dis(gen);
        ++sz;
    }
}

int main()
{
    auto start_time = Clock::now();
    std::vector<std::thread> threads;
    threads.reserve(N);
    
    for (unsigned int i=0; i<N; i++)
    {
        threads.emplace_back(std::move(std::thread(AddN, i*NumPerThread)));
    }

    for (auto &i: threads)
    {
        i.join();
    }
        
    auto end_time = Clock::now();
    std::cout << "\nTime difference = "
    << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
    std::cout << "size = " << sz << '\n';
}

and a single-threaded version

#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>


using Clock = std::chrono::high_resolution_clock;



namespace Vars
{
    const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
    const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
    std::vector<int> RandNums(NumPerThread*N);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, 1000);
    int sz = 0;
}
    


using namespace Vars;


void AddN()
{
    for (unsigned int i=0; i<NumPerThread*N; i++)
    {
        RandNums[i] = dis(gen);
        ++sz;
    }
}

int main()
{
    auto start_time = Clock::now();

    AddN();
    
    auto end_time = Clock::now();
    std::cout << "\nTime difference = "
    << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
    std::cout << "size = " << sz << '\n';
}

The execution times are more or less the same. I am assuming there is a problem with the multi-threaded version?

P.S. I looked at all of the other similar questions here, I don't see how they directly apply to this task...

Solution

You can do with without any mutex.

Create your vector
Use a mutex just to (and technically this probably isn't ncessary) to create an iterator point at v.begin () + itsThreadIndex*NumPerThread;
then each thread can freely increment that iterator and write to a part of the vector not touched by other threads.

Be sure each thread has its own copy of

   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<> dis(1, 1000);

That should run much faster.

UNTESTED code - but this should make my above suggestion more clear:

using Clock = std::chrono::high_resolution_clock;

namespace SharedVars
{
    const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
    const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
    std::vector<int> RandNums(NumPerThread*N);
    std::mutex mtx;
}

void PerThread_AddN(int threadNumber)
{
    using namespace SharedVars;
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, 1000);
    int sz = 0;

    vector<int>::iterator from;
    vector<int>::iterator to;
    {
        std::lock_guard<std::mutex> lock(mtx);  // hold the lock only while accessing shared vector, not while accessing its contents
        from = RandNums.begin () + threadNumber*NumPerThread;
        to = from + NumPerThread;
    }
    for (auto i = from; i < to; ++i)
    {
        *i = dis(gen);
    }
}

int main()
{
    auto start_time = Clock::now();
    std::vector<std::thread> threads;
    threads.reserve(N);
    
    for (unsigned int i=0; i<N; i++)
    {
        threads.emplace_back(std::move(std::thread(PerThread_AddN, i)));
    }
    for (auto &i: threads)
    {
        i.join();
    }
    auto end_time = Clock::now();
    std::cout << "\nTime difference = "
    << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
    std::cout << "size = " << sz << '\n';
}