Tags: cuda, gpgpu, thrust

CUDA: slower thrust::reduce after calling thrust::for_each_n


I am trying to sum an array of numbers using Thrust on a GK107 [GeForce GTX 650]. I am confused to observe that the execution time of thrust::reduce increases dramatically right after a device_vector<curandState> is initialized in device memory.

The following is the sample code:

#include <iostream>
#include <stack>
#include <ctime>

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <curand.h>
#include <curand_kernel.h>

// simple host-side timer based on std::clock()
struct tic_toc{
    std::stack<clock_t> tictoc_stack;
    inline void tic() { tictoc_stack.push(clock());}
    inline void toc() {
        std::cout << "Time elapsed: "
            << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << "s"
            << std::endl;
        tictoc_stack.pop();
    }
};

// functor: initialize one curandState per sequence id
struct curand_setup{
    using init_tuple = thrust::tuple<int, curandState &>;
    const unsigned long long seed;
    curand_setup(unsigned long long _seed) : seed(_seed) {}
    __device__ void operator()(init_tuple t){
        curandState s;
        int id = thrust::get<0>(t);
        curand_init(seed, id, 0, &s);
        thrust::get<1>(t) = s;
    }
};

int main(int argc, char** argv){
    int N = 1<<18;
    std::cout << "N " << N << std::endl;
    tic_toc tt;

    thrust::device_vector<float> val(N,1);

    tt.tic();
    float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    thrust::device_vector<curandState> rand_state(N);
    auto rand_init_it = thrust::make_zip_iterator(
            thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
    thrust::for_each_n(rand_init_it, N, curand_setup(0));

    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    return 0;
}

and the output is:

Time elapsed: 0.000594s
Time elapsed: 5.60026s
Time elapsed: 0.001098s

The situation did not change when I wrote my own kernel for the summation, or when I copied the data to a thrust::host_vector and reduced it there.

Why is thrust::reduce so slow immediately after initializing a thrust::device_vector<curandState>, and is there any way to avoid this problem? Any help would be appreciated.

My system is Linux Mint 18.3 with kernel 4.15.0-23-generic.

output of nvcc --version:

    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2015 NVIDIA Corporation
    Built on Tue_Aug_11_14:27:32_CDT_2015
    Cuda compilation tools, release 7.5, V7.5.17


Solution

  • Why is thrust::reduce so slow just after initializing thrust::device_vector<curandState>

    It isn't. The source of your confusion is your time measurement, which is incorrect.

    In general, Thrust API calls which operate on the device are asynchronous with respect to the host. The only exceptions are calls which return a value (and thrust::reduce is one of those): to produce its result, such a call must block until all previously enqueued work on the stream has completed. As a result, your middle timing region is measuring not only the execution time of thrust::reduce, but also that of the prior thrust::for_each_n call, and it is that prior call which is slow.
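
    To make the sequencing concrete, here is the relevant part of your original code again, unchanged except for comments annotating the launch semantics:

    thrust::for_each_n(rand_init_it, N, curand_setup(0)); // enqueued; the host returns immediately

    tt.tic();                                             // host timer starts here ...
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
                                                          // ... but the reduce kernels cannot start
                                                          // until for_each_n has drained from the
                                                          // stream, and reduce must block the host
                                                          // for its result ...
    tt.toc();                                             // ... so this interval covers both calls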

    You can confirm this for yourself in two ways. First, modify your thrust code like this:

    tt.tic();
    float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    
    thrust::device_vector<curandState> rand_state(N);
    auto rand_init_it = thrust::make_zip_iterator(
            thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
    thrust::for_each_n(rand_init_it, N, curand_setup(0));
    cudaDeviceSynchronize(); // wait until for_each is complete
    
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();
    

    You should get something like this:

    $ nvcc -arch=sm_52 -std=c++11 -o slow_thrust slow_thrust.cu 
    $ ./slow_thrust 
    N 262144
    Time elapsed: 0.000471s
    Time elapsed: 0.000621s
    Time elapsed: 0.000448s
    

    i.e. once cudaDeviceSynchronize() absorbs the runtime of the prior for_each_n call, all three reduce calls take about the same time. The second way is to run a profiling tool on your original code, something like:

    $ nvprof --print-gpu-trace ./slow_thrust
    N 262144
    ==7870== NVPROF is profiling process 7870, command: ./slow_thrust
    Time elapsed: 0.000521s
    Time elapsed: 0.06983s
    Time elapsed: 0.000538s
    ==7870== Profiling application: ./slow_thrust
    ==7870== Profiling result:
       Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
    214.30ms  7.6800us            (512 1 1)       (256 1 1)         8        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>(thrust::device_ptr<float>, float) [109]
    214.56ms  5.8550us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [128]
    214.58ms  2.7200us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [136]
    214.60ms  1.1840us                    -               -         -         -         -        4B  3.2219MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
    214.98ms  221.27us            (512 1 1)       (256 1 1)        20        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>(thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW) [151]
    219.51ms  69.492ms            (512 1 1)       (256 1 1)       108        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>(thrust::use_default, thrust::use_default) [160]
    289.00ms  9.5360us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [179]
    289.01ms  3.4880us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [187]
    289.07ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
    289.66ms  9.9520us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [211]
    289.68ms  3.3280us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [219]
    289.69ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
    

    There you can see that the three operations which make up each reduce call (two kernels plus a device-to-host copy) take roughly 10-15 microseconds in total, whereas the for_each_n requires about 69 milliseconds to complete.
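
    As an aside, if you want to time individual device operations from inside the program without a profiler, CUDA events are a more reliable tool than a host-side clock, because they are recorded on the GPU's own timeline. A minimal sketch, reusing rand_init_it and curand_setup from the code above (error checking omitted for brevity):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    thrust::for_each_n(rand_init_it, N, curand_setup(0)); // still asynchronous on the host
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);             // block until the stop event (and the kernel) has completed
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time between the two events
    std::cout << "for_each_n: " << ms << " ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);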