Search code examples
c++cudathrust

How to prevent the copy of thrust's device_vector to device


So I have a helper class (creatively named “BetterVector”) that is designed to be passed back and forth between host and device, with most of its functionality accessible from either side (addressing a significant flaw of device_vector). However, kernels fail with a non-descriptive allocation error.

From the stack trace, it appears to trigger sometimes on the copy constructor and sometimes on the destructor, and I’m not entirely sure why it changes. I figured the cause was the device_vector data member having a host-only constructor and destructor, so, following a suggestion from another post, I used a union to prevent these functions from being called automatically — but the issue still persists. If any of you have any suggestions, it would be greatly appreciated.

main.cu testing file:

#include <abstract/BetterVector.cuh>

// Functor for thrust::for_each: prints element i of the captured vector.
// NOTE(review): the BetterVector is captured by value, so thrust's
// pass-by-value of the functor invokes the __host__ __device__ copy
// constructor (and matching destructor) each time it is forwarded.
struct thrust_functor {
    abstract::BetterVector<int> vector;

    explicit thrust_functor(const abstract::BetterVector<int> &vector) : vector(vector) {}

    // __host__ only: intended for the thrust::host execution-policy path.
    __host__ void operator()(int i) {
        printf("Thrust functor index %d: %d\n", i, (int) vector[i]);
    }
};

// Kernel: each thread prints one element of the device-resident BetterVector.
// Expects a launch with exactly size() threads — there is no bounds check,
// so extra threads would read out of range.
// NOTE(review): the answer below reports device printf mishandling %zu on
// its toolkit and substitutes %lu.
__global__ void baseCudaPrint(abstract::BetterVector<int>* ptr) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    // Per-thread copy: the device path of the copy ctor copies only the
    // cached raw pointer and cachedSize, not the device_vector member.
    abstract::BetterVector<int> vector = *ptr;
    printf("Cuda kernel index %zu: %d\n", i, (int) vector[i]);
}


int main() {
    abstract::BetterVector<int> vector({1, 2, 3, 4});
    // Host-side reads go through thrust's operator[] (a device access each).
    for (int i = 0; i < 4; i++) {
        printf("Host index %d: %d\n", i, (int) vector[i]);
    }
    printf("\n");

    abstract::BetterVector<int>* devVectorPtr;
    // NOTE(review): CUDA API return codes are ignored throughout main().
    cudaMalloc(&devVectorPtr, sizeof(abstract::BetterVector<int>));
    // BUG: copies only 1 byte — should be sizeof(abstract::BetterVector<int>).
    // (This is the defect the answer identifies and fixes below.)
    cudaMemcpy(devVectorPtr, &vector, 1, cudaMemcpyHostToDevice);
    // One thread per element; the kernel performs no bounds check.
    baseCudaPrint<<<1, vector.size()>>>(devVectorPtr);
    cudaDeviceSynchronize();
    cudaFree(devVectorPtr);
    printf("\n");

    thrust::counting_iterator<int> first(0);
    thrust::counting_iterator<int> last = first + vector.size();
    // Host-policy for_each over [0, size); the functor (and its BetterVector
    // member) is passed by value.
    thrust::for_each(thrust::host, first, last, thrust_functor(vector));
    cudaDeviceSynchronize();
    printf("\n");
}

abstract/BetterVector.cuh:

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/functional.h>

namespace abstract {
// Predicate functor: returns true when the argument equals the captured lhs.
// NOTE(review): unused in this example; also, thrust::unary_function is
// deprecated in recent Thrust releases — confirm against the toolkit in use.
template<typename T>
    struct equal_to : public thrust::unary_function<T, bool> {
        T lhs;  // value compared against

        __device__ __host__ explicit equal_to(T lhs) : lhs(lhs) {}

        __device__ __host__ bool operator()(T rhs) {
            return lhs == rhs;
        }
    };
// Host/device-portable wrapper around a thrust::device_vector: host code uses
// the real vector member, device code uses the cached raw pointer and element
// count. The anonymous union suppresses automatic construction/destruction of
// `vector`, whose constructor and destructor are host-only.
template<typename T, typename VecType = thrust::device_vector<T>>
class BetterVector {
protected:
    typename VecType::pointer raw;  // device pointer cached for device-side operator[]
    size_t cachedSize;              // element count cached for device-side size()
    // Union member: its ctor/dtor do NOT run automatically; the host
    // constructors below must initialize it and the destructor destroy it.
    union {
        VecType vector;
    };

public:

    // Host-only constructors: build the underlying vector, then cache its
    // raw pointer and size for later device-side access.
    // NOTE(review): members initialize in declaration order (raw, cachedSize,
    // then vector), so raw(vector.data()) reads `vector` before it is
    // constructed — a likely contributor to the reported errors. Confirm.
    __host__ BetterVector() : vector(), raw(vector.data()), cachedSize(0) {}

    __host__ explicit BetterVector(size_t size) : vector(size), raw(vector.data()), cachedSize(size) {}

    __host__ explicit BetterVector(VecType vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}

    // NOTE(review): per the answer below, this mem-initializer does not
    // properly construct the union member; the fix uses placement new.
    __host__ explicit BetterVector(std::vector<T> vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}

    // Copy ctor: on the host, deep-copies the underlying vector; in device
    // code (__CUDA_ARCH__ defined) only the cached pointer and size copy.
    __host__ __device__ BetterVector(const BetterVector &otherVec) :
#ifndef __CUDA_ARCH__
            vector(otherVec.vector),
#endif
            cachedSize(otherVec.cachedSize), raw(otherVec.raw) {}


    // Destructor: the union member must be destroyed explicitly, and only on
    // the host (device_vector's destructor is host-only).
    __host__ __device__ virtual ~BetterVector() {
#ifndef __CUDA_ARCH__
        vector.~VecType();
#endif
    }

    // Element access: raw pointer on the device; thrust's operator[] on the
    // host (which, as the answer notes, performs a device copy per access).
    __host__ __device__ typename VecType::const_reference operator[](size_t index) const {
#ifdef __CUDA_ARCH__
        return raw[index];
#else
        return vector[index];
#endif
    }

    __host__ __device__ size_t size() const {
#ifdef __CUDA_ARCH__
        return cachedSize;
#else
        return vector.size();
#endif
    }
} // NOTE(review): missing ';' here, and no closing '}' for namespace abstract — the listing appears truncated

Solution

  • The central issue here seems to be that by using the trick of placing items in union so that constructors and destructors are not automatically called, you have prevented proper initialization of vector, and your constructor(s) are not accomplishing that.

    1. For the first part of the test code, up through the CUDA kernel call, there is one constructor of interest for this particular observation, here:

      __host__ explicit BetterVector(std::vector<T> vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}
      

      My claim is vector(vec) is not properly constructing vector. I suspect this revolves around the use of the union, wherein the defined constructor is not called (and possibly instead a copy-initializer is used, but this is not clear to me).

      In any event, we can use a clue from the link you provided to resolve this:

    constructor can be called through so called "placement new"

    1. As mentioned in the comments, this copy operation cannot possibly be correct: it copies only 1 byte:

      cudaMemcpy(devVectorPtr, &vector, 1, cudaMemcpyHostToDevice);
                                        ^
      
    2. The device version of printf doesn't seem to understand the format specifier %zu, so I replaced it with %lu.

    3. It's not a problem per se, but it may be worthwhile to point out that this line of code:

      abstract::BetterVector<int> vector = *ptr;
      

      produces a separate BetterVector object in each thread, initialized from the object passed to the kernel.

    This level of "fixing" will get you to the point where your main code appears to run correctly up through the CUDA kernel launch. However, the thrust code thereafter still has a problem that I haven't been able to sort out. The call to for_each, if working properly, should generate 3 kernel calls "under the hood" even though it is a host function, due to your code design (using a device_vector in a thrust host path — very odd). Anyway, I'm not able to sort that out for you, but I can say that the 3 kernel calls each trigger a call to your __host__ __device__ constructor (as well as the corresponding destructor), which doesn't surprise me. Thrust is passing a BetterVector object by value to each kernel launch, and doing so triggers a constructor/destructor sequence to support the pass-by-value operation. So given that we had to jump through hoops to get the previous constructor "working", there may be an issue in that sequence. But I haven't been able to pinpoint the problem.

    Anyway here is an example that has the items above addressed:

    $ cat t37.cu
    #include <thrust/device_vector.h>
    #include <thrust/device_ptr.h>
    #include <thrust/functional.h>
    
    namespace abstract {
    // Predicate functor: true when the argument equals the captured lhs.
    // (Unused in this example; carried over from the question unchanged.)
    template<typename T>
        struct equal_to : public thrust::unary_function<T, bool> {
            T lhs;  // value compared against

            __device__ __host__ explicit equal_to(T lhs) : lhs(lhs) {}

            __device__ __host__ bool operator()(T rhs) {
                return lhs == rhs;
            }
        };
    // Host/device-portable wrapper around a thrust::device_vector: host code
    // uses the real vector member, device code the cached raw pointer/size.
    // The anonymous union suppresses automatic construction/destruction of
    // `vector` (its ctor/dtor are host-only).
    template<typename T, typename VecType = thrust::device_vector<T>>
    class BetterVector {
    protected:
        typename VecType::pointer raw;  // device pointer cached for device-side operator[]
        size_t cachedSize;              // element count cached for device-side size()
        union {
            VecType vector;
        };

    public:

        __host__ BetterVector() : vector(), raw(vector.data()), cachedSize(0) {}

        __host__ explicit BetterVector(size_t size) : vector(size), raw(vector.data()), cachedSize(size) {}

        __host__ explicit BetterVector(VecType vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}

    //    __host__ explicit BetterVector(std::vector<T> vec) : vector(vec), raw(vector.data()), cachedSize(vec.size()) {}
        // Fixed ctor: placement new explicitly constructs the union member,
        // then the raw pointer is cached from the now-valid vector.
        __host__ explicit BetterVector(std::vector<T> vec) : cachedSize(vec.size()) { new (&vector) VecType(vec); raw = vector.data();}

        // Copy ctor: host deep-copies the vector; device copies only the
        // cached pointer and size.
        __host__ __device__ BetterVector(const BetterVector &otherVec) :
    #ifndef __CUDA_ARCH__
                vector(otherVec.vector),
    #endif
                cachedSize(otherVec.cachedSize), raw(otherVec.raw) {}


        // Host-only explicit destruction of the union member.
        __host__ __device__ virtual ~BetterVector() {
    #ifndef __CUDA_ARCH__
            vector.~VecType();
    #endif
        }

        // raw pointer on device; thrust's operator[] (a device copy per
        // access) on host.
        __host__ __device__ typename VecType::const_reference operator[](size_t index) const {
    #ifdef __CUDA_ARCH__
            return raw[index];
    #else
            return vector[index];
    #endif
        }

        __host__ __device__ size_t size() const {
    #ifdef __CUDA_ARCH__
            return cachedSize;
    #else
            return vector.size();
    #endif
        }
    };
    }
    
    
    // Functor for thrust::for_each: prints element i of the captured vector.
    // Captured by value, so thrust's pass-by-value triggers the
    // __host__ __device__ copy ctor/dtor sequence discussed in the text.
    struct thrust_functor {
        abstract::BetterVector<int> vector;

        explicit thrust_functor(const abstract::BetterVector<int> &vector) : vector(vector) {}

        __host__ void operator()(int i) {
            printf("Thrust functor index %d: %d\n", i, (int) vector[i]);
        }
    };
    
    // Kernel: each thread prints one element; expects exactly size() threads
    // and performs no bounds check. %lu replaced the question's %zu
    // (assumes size_t is unsigned long — true on LP64 platforms; confirm).
    __global__ void baseCudaPrint(abstract::BetterVector<int>* ptr) {
        const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        // Per-thread copy: device path of the copy ctor copies raw + cachedSize.
        abstract::BetterVector<int> vector = *ptr;
        printf("Cuda kernel index %lu: %d\n", i, (int) vector[i]);
    }
    
    
    int main() {
            // these indented lines mysteriously "fix" the thrust problems
            thrust::device_vector<int> x1(4,1);
            thrust::device_vector<int> x2(x1);
            //
        abstract::BetterVector<int> vector({1, 2, 3, 4});
        // Host-side reads go through thrust's operator[] (a device copy each).
        for (int i = 0; i < 4; i++) {
            printf("Host index %d: %d\n", i, (int) vector[i]);
        }
        printf("\n");

        abstract::BetterVector<int>* devVectorPtr;
        // NOTE(review): CUDA API return codes remain unchecked here.
        cudaMalloc(&devVectorPtr, sizeof(abstract::BetterVector<int>));
        // Fixed: copies the full object (the question copied only 1 byte).
        cudaMemcpy(devVectorPtr, &vector, sizeof(abstract::BetterVector<int>), cudaMemcpyHostToDevice);
        baseCudaPrint<<<1, vector.size()>>>(devVectorPtr);
        cudaDeviceSynchronize();
        cudaFree(devVectorPtr);
        printf("\n");

        thrust::counting_iterator<int> first(0);
        thrust::counting_iterator<int> last = first + vector.size();
        // Host-policy for_each; the functor is passed by value.
        thrust::for_each(thrust::host, first, last, thrust_functor(vector));
        cudaDeviceSynchronize();
        printf("\n");
    }
    $ nvcc -std=c++14 t37.cu -o t37 -lineinfo -arch=sm_70
    $ cuda-memcheck ./t37
    ========= CUDA-MEMCHECK
    Host index 0: 1
    Host index 1: 2
    Host index 2: 3
    Host index 3: 4
    
    Cuda kernel index 0: 1
    Cuda kernel index 1: 2
    Cuda kernel index 2: 3
    Cuda kernel index 3: 4
    
    Thrust functor index 0: 1
    Thrust functor index 1: 2
    Thrust functor index 2: 3
    Thrust functor index 3: 4
    
    ========= ERROR SUMMARY: 0 errors
    $
    

    I'll also add a subjective comment that I think this code design is going to be troublesome (in case that is not clear already) and I would suggest that you consider another path for a "universal" vector. To pick just one example, your method for allowing access via host code using the thrust-provided [] operator is going to be horribly slow. That will invoke a separate cudaMemcpy for each item accessed that way. Anyway, good luck!