Search code examples
c++makefilecmakecudanvcc

Adapting CMakeList.txt to run with CUDA


I'm working with a SLAM system. I've installed DSO, whose code can be seen here:

https://github.com/JakobEngel/dso

Everything works fine: I manage to compile and run without errors. But now I want to parallelize the code using CUDA. I'm having lots of trouble adapting its CMakeLists.txt in order to be able to use CUDA. The original CMakeLists from dso is available here:

dso CMakeLists.txt

I'm trying to adapt it, basing my changes on another author's implementation for a different SLAM system:

ORB SLAM 2 CMakeLists.txt using CUDA

Right now my CMakeLists, with my changes (not working), is like this:

# cmake_minimum_required() has to run before project(): it establishes the
# policy defaults that project() and everything after it are evaluated under.
cmake_minimum_required(VERSION 2.6)

set(PROJECT_NAME DSO)
project(${PROJECT_NAME})
#set(CMAKE_VERBOSE_MAKEFILE ON)

# NOTE(review): BUILD_TYPE is not read anywhere else in the visible part of
# this file; CMake's own build-type variable is CMAKE_BUILD_TYPE -- confirm
# which one was intended.
set(BUILD_TYPE Release)
#set(BUILD_TYPE RelWithDebInfo)

# Output locations for executables and libraries, followed by the directory
# searched for the project's own Find<Package>.cmake modules.
set(EXECUTABLE_OUTPUT_PATH bin)
set(LIBRARY_OUTPUT_PATH lib)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

# required libraries: configuration stops if SuiteParse or Eigen3 is missing.
#SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "/usr/include")
find_package(SuiteParse REQUIRED)
find_package(Eigen3 REQUIRED)
find_package(Boost)

# optional libraries: looked up quietly; their presence is tested further down.
find_package(LibZip QUIET)
find_package(Pangolin 0.2 QUIET)
find_package(OpenCV QUIET)
#find_package(OpenACC)

# flags
# NOTE(review): SSE_FLAGS is not set anywhere in the visible part of this
# file; presumably it is supplied from outside (command line or a module).
add_definitions("-DENABLE_SSE")
set(CMAKE_CXX_FLAGS
   "${SSE_FLAGS} -O3 -g -std=c++11"
)

# -std=c++11 is a C++-only option (gcc warns about it for C, clang rejects
# it), so it is deliberately left out of the C flags.
set(CMAKE_C_FLAGS
    "${SSE_FLAGS} -O3 -g"
)

# CMAKE_C_FLAGS / CMAKE_CXX_FLAGS are space-separated strings, not lists.
# list(APPEND ...) joins its arguments with ';', which is the likely reason
# the two attempts below were rejected. Extending the string instead would be:
#   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -DUSE_NVTX")
#LIST(APPEND CMAKE_C_FLAGS "-Wall -Wextra -DUSE_NVTX") <<<< Error: doesn't recognize -Wall -Wextra
#LIST(APPEND CMAKE_CXX_FLAGS "-Wall -Wextra -DUSE_NVTX") << Error: doesn't recognize -Wall -Wextra

# Locate the CUDA toolkit through the FindCUDA module.
find_package(CUDA REQUIRED)

# Do not forward the host CMAKE_<LANG>_FLAGS to nvcc's host compiler.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)

# NOTE(review): the nvcc flags below also carry "-ccbin gcc-5". The FindCUDA
# documentation states that CUDA_HOST_COMPILER is ignored when -ccbin is
# present in CUDA_NVCC_FLAGS -- confirm which host compiler is really wanted.
set(CUDA_HOST_COMPILER /usr/bin/g++)

# Extra nvcc options, appended as a single space-separated entry.
list(APPEND CUDA_NVCC_FLAGS
  "--compiler-options -fno-strict-aliasing -use_fast_math -ccbin gcc-5 -std=c++11")

if(MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
endif()

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)

# Include path handed to nvcc: the helper headers of the CUDA samples.
cuda_include_directories(${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc)



# Source files of the core library, given relative to <project>/src and
# expanded to full paths in the order listed.
set(dso_SOURCE_FILES "")
foreach(dso_relative_source
    FullSystem/FullSystem.cpp
    FullSystem/FullSystemOptimize.cpp
    FullSystem/FullSystemOptPoint.cpp
    FullSystem/FullSystemDebugStuff.cpp
    FullSystem/FullSystemMarginalize.cpp
    FullSystem/Residuals.cpp
    FullSystem/CoarseTracker.cpp
    FullSystem/CoarseInitializer.cpp
    FullSystem/ImmaturePoint.cpp
    FullSystem/HessianBlocks.cpp
    FullSystem/PixelSelector2.cpp
    OptimizationBackend/EnergyFunctional.cpp
    OptimizationBackend/AccumulatedTopHessian.cpp
    OptimizationBackend/AccumulatedSCHessian.cpp
    OptimizationBackend/EnergyFunctionalStructs.cpp
    util/settings.cpp
    util/Undistort.cpp
    util/globalCalib.cpp
)
  list(APPEND dso_SOURCE_FILES ${PROJECT_SOURCE_DIR}/src/${dso_relative_source})
endforeach()


# Header search paths: project sources first, then the bundled third-party
# headers, then Eigen (same order as a single combined call).
include_directories(${PROJECT_SOURCE_DIR}/src)
include_directories(
  ${PROJECT_SOURCE_DIR}/thirdparty/Sophus
  ${PROJECT_SOURCE_DIR}/thirdparty/sse2neon
)
include_directories(${EIGEN3_INCLUDE_DIR})


# decide if we have pangolin
if (NOT Pangolin_FOUND)
    # Without Pangolin the viewer source list stays unset and HAS_PANGOLIN is 0.
    message("--- could not find PANGOLIN, not compiling dso_pangolin library.")
    message("    this means there will be no 3D display / GUI available for dso_dataset.")
    set(dso_pangolin_SOURCE_FILES)
    set(HAS_PANGOLIN 0)
else ()
    # With Pangolin: add its headers and the two viewer sources.
    message("--- found PANGOLIN, compiling dso_pangolin library.")
    include_directories(${Pangolin_INCLUDE_DIRS})
    set(dso_pangolin_SOURCE_FILES
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/Pangolin/KeyFrameDisplay.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/Pangolin/PangolinDSOViewer.cpp
    )
    set(HAS_PANGOLIN 1)
endif ()

# decide if we have openCV
if (NOT OpenCV_FOUND)
    # Without OpenCV the dummy display / image-IO implementations are compiled.
    message("--- could not find OpenCV, not compiling dso_opencv library.")
    message("    this means there will be no image display, and image read / load functionality.")
    set(dso_opencv_SOURCE_FILES
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/ImageDisplay_dummy.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/ImageRW_dummy.cpp
    )
    set(HAS_OPENCV 0)
else ()
    # With OpenCV: add its headers and the OpenCV-backed implementations.
    message("--- found OpenCV, compiling dso_opencv library.")
    include_directories(${OpenCV_INCLUDE_DIRS})
    set(dso_opencv_SOURCE_FILES
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/OpenCV/ImageDisplay_OpenCV.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/OpenCV/ImageRW_OpenCV.cpp
    )
    set(HAS_OPENCV 1)
endif ()

# decide if we have ziplib.
if (NOT LIBZIP_LIBRARY)
    # No zip support: blank the variable so it adds nothing to the link line.
    message("--- not found ziplib (${LIBZIP_LIBRARY}), compiling without zip capability.")
    set(LIBZIP_LIBRARY "")
else()
    # Zip support: define HAS_ZIPLIB and add the libzip header locations.
    message("--- found ziplib (${LIBZIP_VERSION}), compiling with zip capability.")
    add_definitions(-DHAS_ZIPLIB=1)
    include_directories(
      ${LIBZIP_INCLUDE_DIR_ZIP}
      ${LIBZIP_INCLUDE_DIR_ZIPCONF}
    )
endif()


# compile main library.
include_directories(${CSPARSE_INCLUDE_DIR} ${CHOLMOD_INCLUDE_DIR})

# The shared library combines the host C++ sources with one CUDA source
# (teste.cu), which is why it is declared through cuda_add_library.
cuda_add_library(dso SHARED
  ${dso_SOURCE_FILES}
  ${dso_opencv_SOURCE_FILES}
  ${dso_pangolin_SOURCE_FILES}
  ${PROJECT_SOURCE_DIR}/src/teste.cu
)

#set_property( TARGET dso APPEND_STRING PROPERTY COMPILE_FLAGS -Wall )


# The Boost.Thread library is linked by name; the name differs on OSX.
if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
    set(BOOST_THREAD_LIBRARY boost_thread)
else() # OSX
    set(BOOST_THREAD_LIBRARY boost_thread-mt)
endif()

# build main executable (only if we have both OpenCV and Pangolin)
if (NOT (OpenCV_FOUND AND Pangolin_FOUND))
    message("--- not building dso_dataset, since either don't have openCV or Pangolin.")
else()
    message("--- compiling dso_dataset.")
    add_executable(dso_dataset ${PROJECT_SOURCE_DIR}/src/main_dso_pangolin.cpp)
    target_link_libraries(dso_dataset
      dso
      boost_system
      cxsparse
      ${BOOST_THREAD_LIBRARY}
      ${LIBZIP_LIBRARY}
      ${Pangolin_LIBRARIES}
      ${OpenCV_LIBS}
    )
endif()

unset(CMAKE_RUNTIME_OUTPUT_DIRECTORY)

So, 'main_dso_pangolin.cpp' is my main file. At this point, with only these changes, the code compiles. But I wanted to check whether I could write some CUDA code. To do this I created a 'teste.cu' file that has the same code as one of the CUDA samples, like this:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>
#include <cuda.h>

// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>

// Per-block timed minimum reduction.
//
// Each block stages 2 * blockDim.x floats from `input` in shared memory,
// reduces them to their minimum and has thread 0 store that value in
// output[blockIdx.x]. Thread 0 also samples clock() on entry and on exit.
//
//   input  - read at indices [0, 2 * blockDim.x); the index depends only on
//            threadIdx.x, so every block reads the same elements.
//   output - one result per block, written at output[blockIdx.x].
//   timer  - start sample at timer[blockIdx.x], end sample at
//            timer[blockIdx.x + gridDim.x].
//
// The shared buffer is declared `extern`, so its size is whatever the launch
// supplies; the indexing below needs at least 2 * blockDim.x floats.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Start-of-kernel clock sample, taken by one thread per block.
    if (tid == 0) timer[bid] = clock();

    // Copy input.
    // Each thread stages two elements: its own index and the one blockDim.x above it.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    // The stride d starts at blockDim.x and is halved every pass; threads
    // below d keep the smaller of shared[tid] and shared[tid + d].
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        // Barrier at the top of every pass: the staging writes (first pass)
        // or the previous pass's writes must be complete before reading.
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    // Barrier before the end-of-kernel clock sample is taken.
    __syncthreads();

    if (tid == 0) timer[bid+gridDim.x] = clock();
}

#define NUM_BLOCKS    64
#define NUM_THREADS   256

// Host driver for timedReduction: fills an input buffer, launches the kernel
// on NUM_BLOCKS blocks of NUM_THREADS threads, copies the per-block clock
// samples back and prints the average number of clocks spent per block.
void xx(int argc, char** argv){

    printf("CUDA Clock sample\n");

    // Device selection is delegated to findCudaDevice (from the CUDA samples'
    // helper_cuda.h; the original sample describes it as picking the best
    // possible CUDA capable device). The returned id is not used afterwards.
    findCudaDevice(argc, (const char **)argv);

    // Buffer sizes in bytes, shared by the allocations, copies and launch.
    const size_t inputBytes  = sizeof(float) * NUM_THREADS * 2;
    const size_t outputBytes = sizeof(float) * NUM_BLOCKS;
    const size_t timerBytes  = sizeof(clock_t) * NUM_BLOCKS * 2;

    // Host-side buffers: input values 0, 1, 2, ... and room for the start and
    // end clock sample of every block.
    float hostInput[NUM_THREADS * 2];
    clock_t hostTimer[NUM_BLOCKS * 2];

    int fillIdx = 0;
    while (fillIdx < NUM_THREADS * 2)
    {
        hostInput[fillIdx] = (float)fillIdx;
        ++fillIdx;
    }

    // Device-side buffers.
    float *devInput = NULL;
    float *devOutput = NULL;
    clock_t *devTimer = NULL;

    checkCudaErrors(cudaMalloc((void **)&devInput, inputBytes));
    checkCudaErrors(cudaMalloc((void **)&devOutput, outputBytes));
    checkCudaErrors(cudaMalloc((void **)&devTimer, timerBytes));

    checkCudaErrors(cudaMemcpy(devInput, hostInput, inputBytes, cudaMemcpyHostToDevice));

    // The third launch parameter sizes the kernel's extern shared buffer:
    // two floats per thread, i.e. the same byte count as the input buffer.
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, inputBytes>>>(devInput, devOutput, devTimer);

    // Only the clock samples are read back; the reduction output is not.
    checkCudaErrors(cudaMemcpy(hostTimer, devTimer, timerBytes, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(devInput));
    checkCudaErrors(cudaFree(devOutput));
    checkCudaErrors(cudaFree(devTimer));

    // End samples live NUM_BLOCKS entries after the matching start samples.
    long double totalElapsedClocks = 0;
    for (int block = 0; block < NUM_BLOCKS; ++block)
    {
        const clock_t elapsed = hostTimer[block + NUM_BLOCKS] - hostTimer[block];
        totalElapsedClocks += (long double) elapsed;
    }

    const long double avgElapsedClocks = totalElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

}

And in my main, the first thing I do is call this function. This time, when I run 'cmake' and 'make', I get errors like:

/home/cesar/Documents/dso/src/teste.cu:18:21: error: ‘threadIdx’ was not declared in this scope
     const int tid = threadIdx.x;

/home/cesar/Documents/dso/src/teste.cu:19:21: error: ‘blockIdx’ was not declared in this scope
     const int bid = blockIdx.x;

I've installed the CUDA Toolkit correctly; here is the version:

cesar@cesar-X550JX:/usr/local/cuda/bin$ /usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Sep__1_21:08:03_CDT_2017
Cuda compilation tools, release 9.0, V9.0.176

What do you think I'm doing wrong, or what am I missing? I'm having a lot of difficulty adapting CMakeLists.txt due to its complexity and well-defined structure.

--- EDIT ---

Running with make -j VERBOSE=1 I get these messages, which tell me that a regular C++ compiler is being used:

/usr/bin/c++  -fPIC  -O3 -g -std=c++11 -D_FORCE_INLINES  -shared -Wl,-soname,libdso.so -o lib/libdso.so CMakeFiles/dso.dir/src/FullSystem/FullSystem.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemOptimize.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemOptPoint.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemDebugStuff.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemMarginalize.cpp.o CMakeFiles/dso.dir/src/FullSystem/Residuals.cpp.o CMakeFiles/dso.dir/src/FullSystem/CoarseTracker.cpp.o CMakeFiles/dso.dir/src/FullSystem/CoarseInitializer.cpp.o CMakeFiles/dso.dir/src/FullSystem/ImmaturePoint.cpp.o CMakeFiles/dso.dir/src/FullSystem/HessianBlocks.cpp.o CMakeFiles/dso.dir/src/FullSystem/PixelSelector2.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/EnergyFunctional.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/AccumulatedTopHessian.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/AccumulatedSCHessian.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/EnergyFunctionalStructs.cpp.o CMakeFiles/dso.dir/src/util/settings.cpp.o CMakeFiles/dso.dir/src/util/Undistort.cpp.o CMakeFiles/dso.dir/src/util/globalCalib.cpp.o CMakeFiles/dso.dir/src/IOWrapper/OpenCV/ImageDisplay_OpenCV.cpp.o CMakeFiles/dso.dir/src/IOWrapper/OpenCV/ImageRW_OpenCV.cpp.o CMakeFiles/dso.dir/src/IOWrapper/Pangolin/KeyFrameDisplay.cpp.o CMakeFiles/dso.dir/src/IOWrapper/Pangolin/PangolinDSOViewer.cpp.o CMakeFiles/dso.dir/src/dso_generated_teste.cu.o /usr/local/cuda/lib64/libcudart_static.a -lpthread -ldl -lrt

[ 96%] Building CXX object CMakeFiles/dso_dataset.dir/src/main_dso_pangolin.cpp.o
/usr/bin/c++   -DENABLE_SSE -DHAS_ZIPLIB=1 -I/usr/include/opencv -I/home/cesar/Documents/dso/src -I/home/cesar/Documents/dso/thirdparty/Sophus -I/home/cesar/Documents/dso/thirdparty/sse2neon -I/usr/include/eigen3 -I/home/cesar/Documents/Pangolin/include -I/home/cesar/Documents/Pangolin/build/src/include -I/usr/local/include -I/usr/include/suitesparse -I/usr/local/cuda/include  -O3 -g -std=c++11 -D_FORCE_INLINES   -o CMakeFiles/dso_dataset.dir/src/main_dso_pangolin.cpp.o -c /home/cesar/Documents/dso/src/main_dso_pangolin.cpp

I also tried separating the .cpp files from the .cu files, using add_library for the .cpp files and cuda_add_library for the .cu files, like this:

# Host-only sources go into a regular library ...
add_library(dso
  ${dso_SOURCE_FILES}
  ${dso_opencv_SOURCE_FILES}
  ${dso_pangolin_SOURCE_FILES}
)
# ... and the CUDA source into its own library declared via FindCUDA.
cuda_add_library(my_cuda_lib
  ${PROJECT_SOURCE_DIR}/src/teste.cu
)

And then I used my_cuda_lib in target_link_libraries, like this:

# Link line of the executable, extended with the CUDA runtime libraries and
# the separate CUDA library.
target_link_libraries(dso_dataset
  dso
  boost_system
  cxsparse
  ${BOOST_THREAD_LIBRARY}
  ${LIBZIP_LIBRARY}
  ${Pangolin_LIBRARIES}
  ${OpenCV_LIBS}
  ${CUDA_LIBRARIES}
  my_cuda_lib
)

But I still got the same errors.

-- EDIT: MCVE ---

To demonstrate my error I created a simple example. I have two simple files: my main, which is a .cpp file, and my CUDA file, which is a .cu file. My main just calls the function in the other file and looks like this:

#include <iostream>
#include "hello_world.cu"

using namespace std;

// Program entry point: runs the CUDA-side routine once and exits with 0.
int main()
{
    // The int returned by teste() is deliberately not inspected.
    teste();
    return 0;
}

And my .cu file looks like this:

#include <stdio.h>
#include <iostream>
// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>

// Minimal kernel: declares an extern shared buffer and reads the built-in
// thread and block indices. It writes nothing back.
__global__ void kernel (void){
  extern __shared__ float scratch[];

  const int threadIndex = threadIdx.x;
  const int blockIndex = blockIdx.x;
}

// Launches `kernel` on a single block of a single thread, prints a greeting
// and returns 0. The launch itself is not checked or waited for here.
int teste( void ) {
  const dim3 gridSize(1);
  const dim3 blockSize(1);
  kernel<<<gridSize, blockSize>>>();

  printf( "Hello, World!\n" );
  return 0;
}

My CMakeLists.txt that i made to compile this looks like this:

cmake_minimum_required(VERSION 2.8)

# A top-level CMakeLists.txt needs a direct project() call; without one CMake
# falls back to an implicit project(Project) and recent releases warn about it.
project(helloworld)

# Host compiler that nvcc should use, chosen before FindCUDA is loaded.
set(CUDA_HOST_COMPILER /usr/bin/g++-5)
find_package(CUDA QUIET REQUIRED)

# Pass options to NVCC
# CUDA_NVCC_FLAGS is a list, so new options are appended as list elements.
list(APPEND CUDA_NVCC_FLAGS -O3)

# For compilation ...
# Specify target & source files to compile it from
cuda_add_executable(
    helloworld
    hello_world.cu
    teste.cpp
)

After running cmake and building with "cmake --build ." (I don't know why it has to be this command; normally I just run make -j, but in this example only this works), I get the same errors as in my project: ‘threadIdx’ was not declared in this scope, and likewise for 'blockIdx', etc.


Solution

  • Since you are including the hello_world.cu file in your main code, that main file must itself be compiled with the nvcc compiler. To achieve this, rename teste.cpp to teste.cu (otherwise g++ will be used).

    Also remove 'hello_world.cu' from CMakeLists.txt (it is already included by the teste file), so that you have something like this:

    # Only teste.cu is listed; hello_world.cu reaches the build through the
    # #include inside teste.cu.
    cuda_add_executable(helloworld teste.cu)
    

    Then it should work.

    -- EDIT: Additional question --

    If you want to keep your .cpp file, then you need some separation between what g++ can do for you and what nvcc should do. So you can add an additional hello_world.h file to your project:

    // hello_world.h: plain C++ declaration of the routine implemented in
    // hello_world.cu, so that host-only code can call it without seeing any
    // CUDA syntax.
    #ifndef HELLO_WORLD_H
    #define HELLO_WORLD_H
    
    // Launches the CUDA kernel and prints a greeting; returns 0.
    int teste();
    
    #endif
    

    include it in your teste.cpp:

    #include <iostream>
    #include "hello_world.h"
    
    using namespace std;
    
    // Program entry point: runs the routine declared in hello_world.h once
    // and exits with 0.
    int main()
    {
        // The int returned by teste() is deliberately not inspected.
        teste();
        return 0;
    }
    

    and then your CMakeLists.txt looks like the one in your original example:

    ...
    # Both sources are listed: hello_world.cu is compiled by nvcc, teste.cpp
    # by the host C++ compiler.
    cuda_add_executable(helloworld teste.cpp hello_world.cu)
    

    In that case hello_world.cu will be compiled with nvcc, and the compiling and linking of teste.cpp will be done by g++ (which is possible here since there is no CUDA code in teste.cpp).