Search code examples
c++cmakecuda

Trying to call a device function from another file's global function


I've been trying to make a small project with CMake and CUDA as a beginner to GPU programming (and also sorta CMake) and I've run into a problem so I want to know a way to do the following:

Ok, let's say I have two separate file (they're each two actually a .cuh and a .cu file but I'm just going to refer to them as one each as they are compiled together) I have file A and file B.

I essentially want this:

File A calls a __global__ function located in its own file this global function then calls a __device__ function from File B.

I want to be able to compile File B as a library so that if other files want to reference this __device__ function for any reason I can, and I want File A to compile as an executable.

How do I go about doing this? Is the only option keeping them in the same file?

Edit 1:

I've tried adding -dc and -rdc=true but there are also other files that have no CUDA functionality (no device global or host functions) that are being compiled into libraries and linked to File A.

Edit 2:

I realised I uploaded an error from a previous testing anyway I have a small version of what I am doing, I've tried to keep it short but here is the file structure


main.cu

main.cuh

CMakeLists.txt

Maths/

  AMaths.cu

  AMaths.cuh

  GMaths.cu
  
  GMaths.cuh

  CMakeLists.txt

CMakeLists.txt

cmake_minimum_required (VERSION 3.8)

project (Engine LANGUAGES CUDA CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true -lcudadevrt -lcublas_device -dc")

include_directories(Maths)

add_executable(Main Main.cu Main.cuh)

add_subdirectory(Maths)

target_link_libraries(Main
    PRIVATE
    GMaths
    AMaths)

Main.cu

#include "Main.cuh"

__global__ void caller(float* vec1, float* vec2, float* out) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    GMaths::Add(vec1[i], vec2[i], out[i]);
}

int main() {
    float* vec1;
    float* vec2;
    float* out;
    cudaMalloc(&vec1, sizeof(float) * 3);
    cudaMalloc(&vec2, sizeof(float) * 3);
    cudaMallocManaged(&out, sizeof(float) * 3);
    float* temp = new float[3] {1, 2, 3};
    cudaMemcpy(temp, &vec1, sizeof(float) * 3, cudaMemcpyHostToDevice);
    cudaMemcpy(temp, &vec2, sizeof(float) * 3, cudaMemcpyHostToDevice);
    caller <<< 1, 3 >>> (vec1, vec2, out);
    return 0;
}

Main.cuh

#ifndef MAIN_CUH
#define MAIN_CUH

#include "GMaths.cuh"
#include "AMaths.cuh"

__global__ void caller(float* vec1, float* vec2, float* out);
int main();

#endif

Maths\AMaths.cu

#include "AMaths.cuh"

namespace AMaths {
    void VecAdd(float* vec1, float* vec2, int vecsize, float* out) {
        for (int i = 0; i < vecsize; i++) {
            out[i] = vec1[i] + vec2[i];
        }
    }
    float V2Dot(Vector2 v1, Vector2 v2) {
        return v1.x * v2.x + v1.y * v2.y;
    }
    float V3Dot(Vector3 v1, Vector3 v2) {
        return v1.x * v2.x + v1.y * v2.y + v1.z *v2.z;
    }
    Quaternion EulerToQuaternion(Euler angles) {
        Quaternion result(std::cos(angles.roll / 2) * std::cos(angles.pitch / 2) * std::cos(angles.yaw / 2) + std::sin(angles.roll / 2) * std::sin(angles.pitch / 2) * std::sin(angles.yaw / 2),
        std::sin(angles.roll / 2) * std::cos(angles.pitch / 2) * std::cos(angles.yaw / 2) - std::cos(angles.roll / 2) * std::sin(angles.pitch / 2) * std::sin(angles.yaw / 2),
        std::cos(angles.roll / 2) * std::sin(angles.pitch / 2) * std::cos(angles.yaw / 2) + std::sin(angles.roll / 2) * std::cos(angles.pitch / 2) * std::sin(angles.yaw / 2),
        std::cos(angles.roll / 2) * std::cos(angles.pitch / 2) * std::sin(angles.yaw / 2) - std::sin(angles.roll / 2) * std::sin(angles.pitch / 2) * std::cos(angles.yaw / 2));
        return result;
    }
    Euler QuaternionToEuler(Quaternion angles) {
        float sinr_cosangles = 2 * (angles.w * angles.i + angles.j * angles.k);
        float cosr_cosangles = 1 - 2 * (angles.i * angles.i + angles.j * angles.j);

        float sinangles = std::sqrt(1 + 2 * (angles.w * angles.j - angles.i * angles.k));
        float cosangles = std::sqrt(1 - 2 * (angles.w * angles.j - angles.i * angles.k));

        float siny_cosangles = 2 * (angles.w * angles.k + angles.i * angles.j);
        float cosy_cosangles = 1 - 2 * (angles.j * angles.j + angles.k * angles.k);
        
        Euler result(std::atan2(siny_cosangles, cosy_cosangles),
            2 * std::atan2(sinangles, cosangles) - PI / 2,
            std::atan2(sinr_cosangles, cosr_cosangles));
        return result;
    }
    void Vector3ToQuaternion(Vector3 vector, Quaternion &out) {
        out.w = 0;
        out.i = vector.x;
        out.j = vector.y;
        out.k = vector.z;
    }
    void QuaternionToVector3(Quaternion quat, Vector3 &out) {
        out.x = quat.i;
        out.y = quat.j;
        out.z = quat.k;
    }
    void QuaternionMultiplication(Quaternion Q1, Quaternion Q2, Quaternion &out) {
        out.w = Q1.w * Q2.w - Q1.i * Q2.i - Q1.j * Q2.j - Q1.k  * Q2.k;
        out.i = Q1.w * Q2.i + Q1.i * Q2.w + Q1.j * Q2.k - Q1.k  * Q2.j;
        out.j = Q1.w * Q2.j - Q1.i * Q2.k + Q1.j * Q2.w + Q1.k  * Q2.i;
        out.k = Q1.w * Q2.k + Q1.i * Q2.j - Q1.j * Q2.i + Q1.k  * Q2.w;
    }
    void InverseQuaternion(Quaternion Q, Quaternion &out) {
        out.w = Q.w;
        out.i = -Q.i;
        out.j = -Q.w;
        out.k = -Q.k;
    }
    void QuaternionToRotationMatrix(Quaternion Q, RotationMatrix &out) {
        float v01 = Q.w * Q.i;
        float v02 = Q.w * Q.j;
        float v03 = Q.w * Q.k;
        float v12 = Q.i * Q.j;
        float v13 = Q.i * Q.k;
        float v23 = Q.j * Q.k;
        float q02 = Q.w * Q.w;
        out.x1 = 2 * (q02 + Q.i * Q.i) - 1;
        out.y2 = 2 * (q02 + Q.j * Q.j) - 1;
        out.z3 = 2 * (q02 + Q.k * Q.k) - 1;
        out.y1 = 2 * (v12 - v03);
        out.z1 = 2 * (v13 + v02);
        out.x2 = 2 * (v12 + v03);
        out.z2 = 2 * (v23 - v01);
        out.x3 = 2 * (v13 - v02);
        out.y3 = 2 * (v23 + v01);
    }
}

Maths\AMaths.cuh

#ifndef AMATHS_H
#define AMATHS_H
#include <cmath>
#include <iostream>

namespace AMaths {
    #define PI 3.14159265359
}

namespace Literals {
    constexpr long double operator"" _mm(long double x) { return x / 1000; };
    constexpr long double  operator"" _cm(long double x) { return x / 100; };
    constexpr long double operator"" _m(long double x) { return x; };
    constexpr long double operator"" _deg(long double x) { return x * (PI / 180); };
    constexpr long double operator"" _rad(long double x) { return x; };
}

namespace AMaths {
    #define HALFPI 1.57079632679
    #define QUATERPI 0.785398163397
    #define ONEOVERPI 0.318309886184
    #define A 0.0776509570923569
    #define B -0.287434475393028
    #define C (QUATERPI - A - B)
    #define S1 0.166666666667
    #define S2 0.00833333333333
    #define S3 0.000198412698413
    struct Vector2 {
        float x, y;
    };
    struct Vector3 {
        float x, y, z;
        Vector3() {
            this->x = 0;
            this->y = 0;
            this->z = 0;
        }
        Vector3(float x, float y, float z) {
            this->x = x;
            this->y = y;
            this->z = z;
        }
    };
    struct Euler {
        float yaw, pitch, roll;
        Euler() {
            this->yaw = 0;
            this->pitch = 0;
            this->roll = 0;
        }
        Euler(float yaw, float pitch, float roll) {
            this->yaw = yaw;
            this->pitch = pitch;
            this->roll = roll;
        }
    };
    struct Quaternion {
        float w, i, j, k;
        Quaternion() {
            this->w = 0;
            this->i = 0;
            this->j = 0;
            this->k = 0;
        }
        Quaternion(float w, float i, float j, float k) {
            this->w = w;
            this->i = i;
            this->j = j;
            this->k = k;
        }
    };
    struct RotationMatrix {
        float x1, y1, z1;
        float x2, y2, z2;
        float x3, y3, z3;
    };
    void VecAdd(float* vec1, float* vec2, int vecsize, float* out);
    float V2Dot(Vector2 v1, Vector2 v2);
    float V3Dot(Vector3 v1, Vector3 v2);
    Quaternion EulerToQuaternion(Euler);
    Euler QuaternionToEuler(Quaternion);
    void Vector3ToQuaternion(Vector3 vector, Quaternion &out);
    void QuaternionToVector3(Quaternion quat, Vector3 &out);
    void QuaternionMultiplication(Quaternion Q1, Quaternion Q2, Quaternion &out);
    void InverseQuaternion(Quaternion Q, Quaternion &out);
    void QuaternionToRotationMatrix(Quaternion Q, RotationMatrix &out);
}
#endif

Maths\GMaths.cu

#include "GMaths.cuh"
namespace GMaths {
    __device__ void Add(float val1, float val2, float &out) {
        out = val1 + val2;
    }

    __global__ void VecAdd(float* vec1, float* vec2, float* out) {
        int i = threadIdx.x + blockDim.x * blockIdx.x;
        out[i] = vec1[i] + vec2[i];
    }
    __global__ void MatMulVec4(float* matrix[16], float* vector[4], float* out[4]) {
        int temp = threadIdx.x + blockDim.x * blockIdx.x;
        int i = temp % 4;
        int s = (int)(temp / (4));
        out[s][i] = matrix[s][i] * vector[s][0] + matrix[s][i+4] * vector[s][1] + matrix[s][i+8] * vector[s][2] + matrix[s][i+12] * vector[s][3];
    }
}

Maths\GMaths.cuh

#ifndef GMATHS_CUH
#define GMATHS_CUH
namespace GMaths {
    __device__ void Add(float val1, float val2, float &out);
    __global__ void VecAdd(float* vec1, float* vec2, float* out);
    __global__ void MatMulVec4(float* matrix[16], float* vector[4], float* out[4]);
}
#endif

Maths\CMakeLists.txt

add_library(AMaths AMaths.cuh AMaths.cu)
add_library(GMaths GMaths.cuh GMaths.cu)
target_compile_features(AMaths PUBLIC cxx_std_11)
target_compile_features(GMaths PUBLIC cxx_std_11)
set_target_properties(AMaths PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 
set_target_properties(GMaths PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

Edit 3:

Specific error I am getting currently:

Main.obj : error LNK2019: unresolved external symbol __cudaRegisterLinkedBinary_5bf867cd_7_Main_cu_cd620806 referenced in f
unction "void __cdecl __sti____cudaRegisterAll(void)" (?__sti____cudaRegisterAll@@YAXXZ) [C:\Users\{myname}\source\repos\Render 
ingengine\build\Renderingengine\Main.vcxproj]

Edit 4:

After taking paleonix's suggestion and removing the flags and adding the CUDA_SEPERABLE_COMPILATION to on for main and it works

New CMakeLists.txt:

cmake_minimum_required (VERSION 3.8)

project (Engine LANGUAGES CUDA CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)

include_directories(Maths)

add_executable(Main Main.cu Main.cuh)

set_target_properties(Main PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 

add_subdirectory(Maths)

target_link_libraries(Main
    PRIVATE
    GMaths
    AMaths)

Solution

  • Putting all kinds of flags into CMAKE_CUDA_FLAGS like

    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true -lcudadevrt -lcublas_device -dc")
    

    is a bad idea because even if it works, it isn't very portable (e.g. it wont work if you want to use clang as CUDA compiler). In this particular case you have also mixed compiler flags and linker flags which might be the actual reason for the linker error.

    • To replace -rdc=true and -dc, just use the CUDA_SEPARABLE_COMPILATION property on the Main target just as you have done for the two libraries.
    • According to my observation* -lcudadevrt is automatically added by CMake without any need for your input.
    • -lcublas_device is a bit more tricky as the CUDAToolkit package which provides targets for cuBLAS and other CUDA libraries doesn't seem to provide a target for the device side cuBLAS library. Ideally one would create a proper target for that, but in your context it might be enough to have the plain name cublas_device in the target_link_libraries() command.

    Further remarks:

    • I don't think cxx_std_11 will work on CUDA targets, you should use cuda_std_11 which became available with CMake 3.17, i.e. cmake_minimum_required() should be modified correspondingly.

    • The arguments to cudaMemcpy() are in the wrong order and the & is wrong here. It should be

      cudaMemcpy(vec1, temp, sizeof(float) * 3, cudaMemcpyHostToDevice);
      
    • These are most probably just due to keeping the MRE short and not real oversights on the author's side, but for completeness:


    *: I looked at CMakeFiles/Main.dir/linkLibs.rsp in the build directory to see what is being linked.