Simpson's Integration code with Thrust outputs different results on two machines with NVC++

I wrote a numerical integration code:

#include <thrust/inner_product.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <array>

#include <iostream>


template<typename val_type, std::size_t n, std::size_t n_batch,
                template<typename...> typename Container>
struct Integrator {

    const val_type coeff;

    Integrator(val_type a, val_type b) :
        coeff((b-a)/3./static_cast<val_type>(n)) {}

    template <typename itor_type>
    void operator()(itor_type f_begin, itor_type dens_begin) {

        val_type C = this->coeff;
        auto zitor_begin = thrust::make_zip_iterator(
                            thrust::make_tuple(
                              thrust::make_counting_iterator(0),f_begin));

        auto titor_begin = make_transform_iterator(zitor_begin,
        [C](auto _tuple){ 
            
             return static_cast<val_type>(thrust::get<1>(_tuple) 
                          * (thrust::get<0>(_tuple)==0 ? C 
                                             : thrust::get<0>(_tuple)%2==0? 2.*C:4.*C)); 
        });

        auto binary_pred = [](int i,int j) { return i/n==j/n ; };

        thrust::reduce_by_key(thrust::make_counting_iterator(0),  // input key
                              thrust::make_counting_iterator(static_cast<int>(n*n_batch)),
                              titor_begin,                        // input value
                                                    thrust::make_discard_iterator(),    // output key
                              dens_begin,                         // output value
                              binary_pred);
    } // end of operator()

};

The algorithm is a simple Simpson integration. And here is the main.cu:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "Integrator.hpp"

using Real = double;


constexpr std::size_t n = 1000, m = 4;
constexpr Real h = 2.*M_PI/(static_cast<Real>(n)-1.);

int main(int argc, char* argv[]) {


    Integrator<Real,n,m,thrust::device_vector> inttest(0,2.*M_PI);
    
    thrust::host_vector<Real> _fff(n*m), _x(n);
    thrust::device_vector<Real> fff(n*m), fint(m);
    for (int i=0; i<n; ++i)
        _x[i] = i*h;
    for (int I=0; I<m; ++I)
        for (int i=0; i<n; ++i)
            _fff[I*n+i] = std::sin(0.5*_x[i]);
    
    fff = _fff;
    inttest(fff.begin(),fint.begin());
    
    thrust::copy(fint.begin(),fint.end(),
                  std::ostream_iterator<Real>(std::cout," "));
    std::cout << std::endl;


}

I compiled this code on a Windows Subsystem for Linux (WSL) of a Windows11 system and a Ubuntu22.4 Linux server. Compilation succeeded on both machines. (I simply used nvc++ main.cu)

The two systems (WSL and Ubuntu Linux) have exactly the same

gcc version (11.3),
cuda version (12.0) and
nvc++ version (23.3).

However, only the Linux machine yields the correct result (should be 3.99 3.99 3.99 3.99), while The WSL Windows11 machine yields 0 0 0 0. And the size of the executable a.out file is 24Mb on WSL and 8Mb on the Linux machine, so I know something wrong must have happened, but where?

Solution

As the problem only occurs with nvc++ and not with nvcc, this seems to be a compiler problem.

The NVHPC SDK (and with it the nvc++ compiler) does not officially support Windows or WSL yet, as none of them is mentioned in the installation guide or the release notes.