Search code examples
c++boostcudathrust

Thrust+boost code compilation error


I have a strange problem that I can't solve. It occurs in code that combines Boost and Thrust.

Code:

#include <boost/config/compiler/nvcc.hpp>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/random.h>
#include <thrust/generate.h>
#include <thrust/detail/type_traits.h>

#include <cuda_runtime.h>

#include <cublas_v2.h>
#include <common/inc/helper_cuda.h>

#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/generate.hpp>
#include <boost/compute/algorithm/generate_n.hpp>


#include <algorithm>
#include <time.h>
#include <limits.h>
#include <algorithm>

using namespace boost::numeric::ublas;
using namespace boost::random;
using namespace boost::compute;


// Reproduction program from the question: builds two N x N float matrices,
// multiplies them with Boost.uBLAS prod() and with two cublasSgemm calls, then
// prints the cuBLAS status values and two counters from the comparison loop.
int main(int argc, char **argv)
{
    // NOTE(review): N is an int, so every N*N below is evaluated in int
    // arithmetic; 100000 * 100000 = 10^10 does not fit in a 32-bit int.
    int N = 100000;

    unbounded_array<float> lineMatrix1(N*N);
    unbounded_array<float> lineMatrix2(N*N);    

    // NOTE(review): generate_n is unqualified; with the file's using-directives
    // it is not obvious which library's generate_n is selected -- confirm.
    // The lambda body is pure integer arithmetic (rand() and RAND_MAX are
    // ints), so it yields whole numbers rather than fractional values.
    generate_n(lineMatrix1.begin(), N*N, []() { return (10 * rand() / RAND_MAX); });
    generate_n(lineMatrix2.begin(), N*N, []() { return (10 * rand() / RAND_MAX); });    

    matrix<float> matrix1(N, N, lineMatrix1);
    matrix<float> matrix2(N, N, lineMatrix2);
    matrix<float> zeroMatrix(N, N, 0);  
    matrix<float> zeroMatrix2(N, N, 0);

    //boost single core computation start

    auto matrix3 = prod(matrix1, matrix2);

    //boost single core computation finish

    //thrust computation start

    findCudaDevice(argc, (const char **)argv);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Scalars for the GEMM calls below: alpha = 1 and beta = 0.
    float alpha = 1.0f;
    float beta = 0.0f;

    // First GEMM call: operands are taken straight from the uBLAS matrices.
    // NOTE(review): these are iterators into the uBLAS host storage, while the
    // cuBLAS documentation describes the matrix arguments of cublasSgemm as
    // device memory -- confirm this call is intended.
    auto result = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, matrix1.data().cbegin(), N, matrix2.data().cbegin(), N, &beta, zeroMatrix.data().begin(), N);
    cudaDeviceSynchronize();

    // Second GEMM call: operands are first copied into thrust::device_vector
    // buffers and passed as raw pointers via data().get().
    thrust::device_vector<float> deviceMatrix1(N*N);
    thrust::device_vector<float> deviceMatrix2(N*N);
    thrust::device_vector<float> deviceZeroMatrix(N*N, 0);

    thrust::copy(matrix1.data().cbegin(), matrix1.data().cend(), deviceMatrix1.begin());
    thrust::copy(matrix2.data().cbegin(), matrix2.data().cend(), deviceMatrix2.begin());

    auto result2 = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, deviceMatrix1.data().get(), N, deviceMatrix2.data().get(), N, &beta, deviceZeroMatrix.data().get(), N);
    cudaDeviceSynchronize();

    // Copy the second GEMM's output buffer back into a uBLAS matrix.
    thrust::copy(deviceZeroMatrix.cbegin(), deviceZeroMatrix.cend(), zeroMatrix2.data().begin());

    // Print the values returned by the two cublasSgemm calls.
    std::cout << result << std::endl;
    std::cout << result2 << std::endl;

    //thrust computation finish

    float eps = 0.00001;
    int differCount1 = 0;
    int differCount2 = 0;

    // Compare the uBLAS product element by element with both cuBLAS outputs.
    for (int i = 0; i < matrix3.size1(); i++)
    {
        for (int j = 0; j < matrix3.size2(); j++)
        {
            // NOTE(review): `a != b` yields a bool, so std::abs() is applied
            // to 0 or 1 rather than to the difference a - b; eps therefore
            // does not act as a tolerance here.
            if (std::abs(matrix3(i, j) != zeroMatrix(i, j)) > eps)
                differCount1++;

            if (std::abs(matrix3(i, j) != zeroMatrix2(i, j)) > eps)
                differCount2++;
        }
    }

    std::cout << differCount1 << std::endl;
    std::cout << differCount2 << std::endl;

    // Read one character from stdin before returning.
    char c;
    std::cin >> c;

    return 0;
}

This file is named 'myFirstMatrixTest.cu'.

When I build it, I get these compiler errors:

MSB3721 exit from command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\bin\nvcc.exe" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_37,code=\"sm_37,compute_37\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_60,code=\"sm_60,compute_60\" -gencode=arch=compute_61,code=\"sm_61,compute_61\" -gencode=arch=compute_70,code=\"sm_70,compute_70\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.14.26428\bin\HostX86\x64" -x cu -rdc=true -I./ -I../common/inc -I../../common/inc -I/common/inc -I../ -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -Xcompiler "/wd 4819" -g -DWIN32 -DWIN32 -D_MBCS -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MTd " -o x64/Debug/MyFirstMatrixTest.cu.obj "C:\User Root\Repository\CUDA Projects\MatrixMultiplicationThrust\MyFirstMatrixTest.cu"" with code "2". MyFirstMatrixTest C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\VC\VCTargets\BuildCustomizations\CUDA 9.2.targets 707

and this:

Fatal Error C1012 unmatched parenthesis : missing character ")" MyFirstMatrixTest c:\local\boost\preprocessor\slot\detail\shared.hpp 27

Why could this error occur?

Thank you.


Solution

  • Well, the first problem is

    int N = 100000;
    

    So N^2 = 10,000,000,000, which will never fit in a 32-bit int. That is 10G elements * 4 bytes (float) = 40 GB of data. For me, that throws a memory exception.

    The next problem I had was with the combination of unbounded_array and generate_n; it just didn't work. But since you're using Thrust, use the Thrust types and algorithms (I'm not sure why Thrust has its own types to replace the STL ones, but whatever).

    I'm using Visual Studio 2017 v15.7 in 2015 mode (else I get a not supported error) with Cuda v9.2 and Boost 1.67.0.

    I modified your code until it compiled correctly. (Note the correction in the randomizer functor: the original generated only integers and then converted them to floats.)

    #include <boost/config/compiler/nvcc.hpp>
    
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/copy.h>
    #include <thrust/generate.h>
    #include <thrust/inner_product.h>
    
    #include <cuda_runtime.h>
    
    #include <cublas_v2.h>
    #pragma comment(lib,"cublas.lib")
    #include <helper_cuda.h>
    
    #include <boost/numeric/ublas/matrix.hpp>
    //#include <boost/numeric/ublas/io.hpp>
    using boost::numeric::ublas::matrix;
    
    #include <random>
    
    int main(int argc, char **argv)
    {
        constexpr size_t N = 100;
        constexpr size_t NN = N * N;
    
        thrust::host_vector<float> lineMatrix1; lineMatrix1.reserve(NN);
        thrust::host_vector<float> lineMatrix2; lineMatrix2.reserve(NN);
        {
            std::random_device rd;  //Will be used to obtain a seed for the random number engine
            std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
            std::uniform_real_distribution<float> dis(0.0f, 10.0f);
            auto genRnd = [&]() { return dis(gen); };
            thrust::generate_n(std::back_inserter(lineMatrix1), NN, genRnd);
            thrust::generate_n(std::back_inserter(lineMatrix2), NN, genRnd);
        }
    
        matrix<float> matrix1(N, N);
        thrust::copy_n(std::cbegin(lineMatrix1), NN, std::begin(matrix1.begin1()));
        //std::cout << "Matrix 1:\n" << matrix1 << std::endl;
    
        matrix<float> matrix2(N, N);
        thrust::copy_n(std::cbegin(lineMatrix2), NN, std::begin(matrix2.begin1()));
        //std::cout << "Matrix 2:\n" << matrix2 << std::endl;
    
        //auto matrix3 = prod(matrix1, matrix2);
        auto matrix3 = trans(prod(trans(matrix1), trans(matrix2)));
        //std::cout << "Matrix 3:\n" << matrix3 << std::endl;
    
        thrust::host_vector<float> hostResult; hostResult.reserve(NN);
        for (auto rowIt = matrix3.cbegin1(); rowIt != matrix3.cend1(); rowIt++)
            for (const auto& element : rowIt)
                hostResult.push_back(element);
        std::cout << "Host Result:\n";
        for (const auto& el : hostResult) std::cout << el << " ";
        std::cout << std::endl;
        //////boost single core computation finish
    
        //////thrust computation start
        findCudaDevice(argc, (const char **)argv);
        cublasHandle_t handle;
        cublasCreate(&handle);
    
        const float alpha = 1.0f;
        const float beta = 0.0f;
    
        thrust::device_vector<float> deviceMatrix1; deviceMatrix1.reserve(NN);
        thrust::copy_n(std::cbegin(lineMatrix1), NN, std::back_inserter(deviceMatrix1));
    
        thrust::device_vector<float> deviceMatrix2; deviceMatrix2.reserve(NN);
        thrust::copy_n(std::cbegin(lineMatrix2), NN, std::back_inserter(deviceMatrix2));
    
        thrust::device_vector<float> deviceZeroMatrix(NN,0);
        auto result2 = cublasSgemm(handle,
            CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
            &alpha,
            deviceMatrix1.data().get(), N,
            deviceMatrix2.data().get(), N,
            &beta,
            deviceZeroMatrix.data().get(), N);
        cudaDeviceSynchronize();
    
        cublasDestroy(handle);
    
        thrust::host_vector<float> deviceResult; deviceResult.reserve(NN);
        thrust::copy_n(std::cbegin(deviceZeroMatrix), NN, std::back_inserter(deviceResult));
        std::cout << "Device Result:\n";
        for (const auto& el : deviceResult) std::cout << el << " ";
        std::cout << std::endl;
        //////thrust computation finish    
    
        auto accError = thrust::inner_product(std::cbegin(hostResult), std::cend(hostResult), std::cbegin(deviceResult), 0.0f, std::plus<float>(),
            [](auto val1, auto val2) { return std::abs(val1 - val2); });
    
        std::cout << "Accumulated error: " << accError << std::endl;
        std::cout << "Average error: " << accError/NN << std::endl;
    
        std::cin.ignore();
    
        return 0;
    }
    

    edit: Fixed the code. A ublas matrix stores its elements differently than a vector, so I had to transpose the matrices and the result. Furthermore, it turned out to be difficult to copy the ublas matrix back to a vector.

    edit2: compilation parameters

    "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\bin\nvcc.exe" -gencode=arch=compute_30,code=\"sm_30,compute_30\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -x cu  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 --compile -cudart static  -g   -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MDd " -o x64\Debug\kernel.cu.obj "C:\Cpp\Cuda\SoHelp2\kernel.cu"