Search code examples
c++ · cuda · thrust

Unexplained errors in namespace thrust::system::cuda::thrust specifically in "system_error" and "cuda_category"


I am trying to cast a raw pointer using thrust::raw_pointer_cast to catch the output in the functor. I have tried multiple approaches to passing a pointer to a float, but keep getting a memory conflict and two IntelliSense errors: thrust::system::cuda::thrust "has no member system_error" and "has no member cuda_category". The odd thing is that the error seems to originate in the file throw_on_error.hpp, which appears to be part of the BULK library, even though I have not specifically referenced BULK. I'm new to C++, so it may be possible that I am misunderstanding pointers, or that I'm missing some sort of include.

Below is the version of the code I've been trying to get to work. Any help would be greatly appreciated.

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>


using namespace std;

const int num_segs = 1;  // number of segments to sort
const int num_vals = 5;  // number of values in each segment


template <typename T> 
struct sort_vector
{

    T *Ndata;
    T *Ddata;
    T *answer;

    sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};


    __host__ __device__ void operator()(int idx)
    {
        thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
        thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
        *answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));

    }
};

int main() {

    thrust::device_vector<float> d_Ndata(num_segs*num_vals);
    d_Ndata[0] = 30;
    d_Ndata[1] = 5.5;
    d_Ndata[2] = 60;
    d_Ndata[3] = 21;
    d_Ndata[4] = 2;

    thrust::device_vector<float> d_Ddata(num_segs*num_vals);
    d_Ddata[0] = 50;
    d_Ddata[1] = 9.5;
    d_Ddata[2] = 30;
    d_Ddata[3] = 8.1;
    d_Ddata[4] = 1;

    cout << "original norm" << endl;
    int f = 0;
    while (f < num_segs*num_vals){
        cout << d_Ndata[f] << endl;
        f++;
    }

    cout << "original dut" << endl;
    int g = 0;
    while (g < num_segs*num_vals){
        cout << d_Ddata[g] << endl;
        g++;
    }

    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());

    float *answer = (float*)malloc(sizeof(float));

    cudaStream_t s1;
    cudaStreamCreate(&s1);


    clock_t start;
    double duration;
    start = clock();

    thrust::for_each(thrust::cuda::par.on(s1),
        d_idxs.begin(),
    d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(answer)));

    cudaStreamSynchronize(s1);

    cout << "sum" << endl;
    cout << answer << endl;

    //free(answer);

    cudaStreamDestroy(s1);


    duration = (clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "time " << duration << endl;

    cin.get();
    return 0;
}

Solution

  • The main problem is here:

    float *answer = (float*)malloc(sizeof(float));
    

    this is creating a host memory allocation. When you then pass that pointer to the functor:

     thrust::raw_pointer_cast(answer)
    

    you are passing a pointer to host memory to a functor that will run in device code. If the functor attempts to access that location, it will be an illegal access. In CUDA, device code is not allowed to directly access a host pointer location, and vice versa (ignoring various concepts that are not in play here).

    So when your functor code does this:

    *answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
    

    that will trigger an illegal access when it tries to write to *answer.

    A straightforward solution would be to create answer as pointing to a properly allocated location in device memory. The following code demonstrates the change and runs without error for me:

    $ cat t1190.cu
    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/sort.h>
    #include <thrust/execution_policy.h>
    #include <thrust/for_each.h>
    #include <thrust/sequence.h>
    #include <cstdlib>
    #include <ctime>
    #include <vector>
    #include <algorithm>
    #include <memory.h>
    #include <cstdio>
    #include <thread>
    #include <thrust/copy.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/reduce.h>
    
    
    using namespace std;
    
    const int num_segs = 1;  // number of segments to sort
    const int num_vals = 5;  // number of values in each segment
    
    
    template <typename T>
    struct sort_vector
    {
    
        T *Ndata;
        T *Ddata;
        T *answer;
    
        sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};
    
    
        __host__ __device__ void operator()(int idx)
        {
            thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
            thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
            *answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
    
        }
    };
    
    int main() {
    
        thrust::device_vector<float> d_Ndata(num_segs*num_vals);
        d_Ndata[0] = 30;
        d_Ndata[1] = 5.5;
        d_Ndata[2] = 60;
        d_Ndata[3] = 21;
        d_Ndata[4] = 2;
    
        thrust::device_vector<float> d_Ddata(num_segs*num_vals);
        d_Ddata[0] = 50;
        d_Ddata[1] = 9.5;
        d_Ddata[2] = 30;
        d_Ddata[3] = 8.1;
        d_Ddata[4] = 1;
    
        cout << "original norm" << endl;
        int f = 0;
        while (f < num_segs*num_vals){
            cout << d_Ndata[f] << endl;
            f++;
        }
    
        cout << "original dut" << endl;
        int g = 0;
        while (g < num_segs*num_vals){
            cout << d_Ddata[g] << endl;
            g++;
        }
    
        thrust::device_vector<int> d_idxs(num_segs);
        thrust::sequence(d_idxs.begin(), d_idxs.end());
    
        thrust::device_vector<float> dv_answer(1);
        //float *answer = (float*)malloc(sizeof(float));
    
        cudaStream_t s1;
        cudaStreamCreate(&s1);
    
    
        clock_t start;
        double duration;
        start = clock();
    
        thrust::for_each(thrust::cuda::par.on(s1),
            d_idxs.begin(),
        d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(dv_answer.data())));
    
        cudaStreamSynchronize(s1);
    
        cout << "sum" << endl;
        cout << dv_answer[0] << endl;
    
        //free(answer);
    
        cudaStreamDestroy(s1);
    
    
        duration = (clock() - start) / (double)CLOCKS_PER_SEC;
        cout << "time " << duration << endl;
    
        return 0;
    }
    $ nvcc -std=c++11  t1190.cu -o t1190
    $ ./t1190
    original norm
    30
    5.5
    60
    21
    2
    original dut
    50
    9.5
    30
    8.1
    1
    sum
    98.6
    time 0.000919
    $
    

    I'm not going to try to explain the IntelliSense errors. IntelliSense generally doesn't play well with CUDA, and as you've seen, IntelliSense may flag things that will actually compile just fine (such as the code here in this question). If a CUDA code compiles correctly, there's a good chance that the IntelliSense-reported issues can be safely ignored.

    As an additional few comments:

    1. You seem to be going down an odd path here for a thrust beginner, running Thrust algorithms from within a functor. There's nothing technically wrong with what you are doing, but this type of code would normally be reserved for specific situations, not for general thrust usage. Since your num_segs is 1 in this example, you will be running one CUDA thread to perform all this work, which will definitely not be performant. If you intend to scale up later, great. I've made similar comments previously, so I'll not elaborate further here.

    2. This functor writes to a single location (*answer) to deposit its result. If you scale this up to more than one thread, you're going to have to provide more than one location for the functor to write to (one per thread or per element in the vector you pass to for_each) or else threads will be overwriting results from each other.