Search code examples
c++stdthread

C++ multithreaded version of creating vector of random numbers slower than single-threaded version


I am trying to write a multi-threaded program to produce a vector of N*NumPerThread uniform random integers, where N is the return value of std::thread::hardware_concurrency() and NumPerThread is the amount of random numbers I want each thread to generate.

I created a multi-threaded version:

#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>

using Clock = std::chrono::high_resolution_clock;

namespace Vars
{
    const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
    const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
    std::vector<int> RandNums(NumPerThread*N);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, 1000);
    int sz = 0;
}

using namespace Vars;

void AddN(int start)
{
    static std::mutex mtx;
    std::lock_guard<std::mutex> lock(mtx);
    for (unsigned int i=start; i<start+NumPerThread; i++)
    {
        RandNums[i] = dis(gen);
        ++sz;
    }
}

int main()
{
    auto start_time = Clock::now();
    std::vector<std::thread> threads;
    threads.reserve(N);
    
    for (unsigned int i=0; i<N; i++)
    {
        threads.emplace_back(std::move(std::thread(AddN, i*NumPerThread)));
    }

    for (auto &i: threads)
    {
        i.join();
    }
        
    auto end_time = Clock::now();
    std::cout << "\nTime difference = "
    << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
    std::cout << "size = " << sz << '\n';
}

and a single-threaded version

#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <chrono>


using Clock = std::chrono::high_resolution_clock;



namespace Vars
{
    const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
    const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
    std::vector<int> RandNums(NumPerThread*N);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(1, 1000);
    int sz = 0;
}
    


using namespace Vars;


void AddN()
{
    for (unsigned int i=0; i<NumPerThread*N; i++)
    {
        RandNums[i] = dis(gen);
        ++sz;
    }
}

int main()
{
    auto start_time = Clock::now();

    AddN();
    
    auto end_time = Clock::now();
    std::cout << "\nTime difference = "
    << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
    std::cout << "size = " << sz << '\n';
}

The execution times are more or less the same. I am assuming there is a problem with the multi-threaded version?

P.S. I looked at all of the other similar questions here, I don't see how they directly apply to this task...


Solution

  • You can do with without any mutex.

    • Create your vector
    • Use a mutex just to (and technically this probably isn't ncessary) to create an iterator point at v.begin () + itsThreadIndex*NumPerThread;
    • then each thread can freely increment that iterator and write to a part of the vector not touched by other threads.

    Be sure each thread has its own copy of

       std::random_device rd;
       std::mt19937 gen(rd());
       std::uniform_int_distribution<> dis(1, 1000);
    

    That should run much faster.

    UNTESTED code - but this should make my above suggestion more clear:

    using Clock = std::chrono::high_resolution_clock;
    
    namespace SharedVars
    {
        const unsigned int N = std::thread::hardware_concurrency(); //number of threads on device
        const unsigned int NumPerThread = 5e5; //number of random numbers to generate per thread
        std::vector<int> RandNums(NumPerThread*N);
        std::mutex mtx;
    }
    
    void PerThread_AddN(int threadNumber)
    {
        using namespace SharedVars;
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_int_distribution<> dis(1, 1000);
        int sz = 0;
    
        vector<int>::iterator from;
        vector<int>::iterator to;
        {
            std::lock_guard<std::mutex> lock(mtx);  // hold the lock only while accessing shared vector, not while accessing its contents
            from = RandNums.begin () + threadNumber*NumPerThread;
            to = from + NumPerThread;
        }
        for (auto i = from; i < to; ++i)
        {
            *i = dis(gen);
        }
    }
    
    int main()
    {
        auto start_time = Clock::now();
        std::vector<std::thread> threads;
        threads.reserve(N);
        
        for (unsigned int i=0; i<N; i++)
        {
            threads.emplace_back(std::move(std::thread(PerThread_AddN, i)));
        }
        for (auto &i: threads)
        {
            i.join();
        }
        auto end_time = Clock::now();
        std::cout << "\nTime difference = "
        << std::chrono::duration<double, std::nano>(end_time - start_time).count() << " nanoseconds\n";
        std::cout << "size = " << sz << '\n';
    }