I have been trying to implement some code that calls thrust::reduce on a thrust::device_ptr, and the results are not consistent with the CPU implementation when dealing with large values. I have to work with large values, so is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

#define NZ 412
#define NX 402

using namespace std;
using real = double;

void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
    // Contiguous allocation of 2D arrays
    preal = new real*[dim1];
    preal[0] = new real[dim1 * dim2];
    for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
    for (int i = 0; i < dim1; i++) {
        for (int j = 0; j < dim2; j++) {
            preal[i][j] = 0;
        }
    }
}

#define cudaCheckError(code)                                         \
    {                                                                \
        if ((code) != cudaSuccess) {                                 \
            fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, \
                    __LINE__, cudaGetErrorString(code));             \
        }                                                            \
    }

int main()
{
    real** a;
    std::cout.precision(30);
    allocate_array_2d(a, NZ, NX);  // input array
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            a[i][j] = 2.14748e+09;
        }
    }

    real* da;
    cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
    cudaCheckError(cudaMemcpy(da, a[0], NZ * NX * sizeof(real), cudaMemcpyHostToDevice));

    ///************************
    // CUDA KERNELS ARE HERE
    // REMOVED FOR CLEAR QUESTION
    ///*************************

    real sum1 = 0;
    thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
    sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
    cout << " \nsum gpu " << sum1 << "\n";

    real sum2 = 0;
    //////// CPU PART DOING THE SAME THING //////
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            sum2 += a[i][j];
        }
    }
    cout << "\nsum cpu " << sum2 << "\n";
    if ((sum2 - sum1) < 0.001)
        std::cout << "\nSUCCESS " << "\n";
    else
        std::cout << "\nFailure & by " << sum2 - sum1 << "\n";
}
The compiler I am using is nvcc, and my graphics card is an NVIDIA GTX 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init
value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
                                              ^
The type of that constant you have is an integral type, so the reduction is accumulated in an int, which cannot hold a sum of this magnitude. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results between CPU and GPU, according to my testing. (You could alternatively cast your constant to the real type, i.e. (real)0, and use that; there are other ways to address this as well, such as dropping the use of the init value and the binary op.)
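For reference, here is a minimal sketch of those two alternatives, assuming the same dev_ptr, NZ and NX as in your code:

    // cast the init constant to the element type, so the accumulation type is real (double)
    sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, (real)0, thrust::plus<real>());

    // or drop the init value and the binary op entirely; thrust then uses the
    // iterator's value_type (double here) with an init of 0 and thrust::plus
    sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX);

Either form keeps the entire accumulation in double precision, which is what makes the GPU result agree with your CPU loop.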