I've been trying to make a small project with CMake and CUDA as a beginner to GPU programming (and also sorta CMake) and I've run into a problem so I want to know a way to do the following:
Ok, let's say I have two separate file (they're each two actually a .cuh
and a .cu
file but I'm just going to refer to them as one each as they are compiled together) I have file A and file B.
I essentially want this:
File A calls a __global__
function located in its own file this global function then calls a __device__
function from File B.
I want to be able to compile File B as a library so that if other files want to reference this __device__
function for any reason I can, and I want File A to compile as an executable.
How do I go about doing this? Is the only option keeping them in the same file?
Edit 1:
I've tried adding -dc
and -rdc=true
but there are also other files that have no CUDA functionality (no device global or host functions) that are being compiled into libraries and linked to File A.
Edit 2:
I realised I uploaded an error from a previous testing anyway I have a small version of what I am doing, I've tried to keep it short but here is the file structure
main.cu
main.cuh
CMakeLists.txt
Maths/
AMaths.cu
AMaths.cuh
GMaths.cu
GMaths.cuh
CMakeLists.txt
CMakeLists.txt
cmake_minimum_required (VERSION 3.8)
project (Engine LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true -lcudadevrt -lcublas_device -dc")
include_directories(Maths)
add_executable(Main Main.cu Main.cuh)
add_subdirectory(Maths)
target_link_libraries(Main
PRIVATE
GMaths
AMaths)
Main.cu
#include "Main.cuh"
__global__ void caller(float* vec1, float* vec2, float* out) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
GMaths::Add(vec1[i], vec2[i], out[i]);
}
int main() {
float* vec1;
float* vec2;
float* out;
cudaMalloc(&vec1, sizeof(float) * 3);
cudaMalloc(&vec2, sizeof(float) * 3);
cudaMallocManaged(&out, sizeof(float) * 3);
float* temp = new float[3] {1, 2, 3};
cudaMemcpy(temp, &vec1, sizeof(float) * 3, cudaMemcpyHostToDevice);
cudaMemcpy(temp, &vec2, sizeof(float) * 3, cudaMemcpyHostToDevice);
caller <<< 1, 3 >>> (vec1, vec2, out);
return 0;
}
Main.cuh
#ifndef MAIN_CUH
#define MAIN_CUH
#include "GMaths.cuh"
#include "AMaths.cuh"
__global__ void caller(float* vec1, float* vec2, float* out);
int main();
#endif
Maths\AMaths.cu
#include "AMaths.cuh"
namespace AMaths {
void VecAdd(float* vec1, float* vec2, int vecsize, float* out) {
for (int i = 0; i < vecsize; i++) {
out[i] = vec1[i] + vec2[i];
}
}
float V2Dot(Vector2 v1, Vector2 v2) {
return v1.x * v2.x + v1.y * v2.y;
}
float V3Dot(Vector3 v1, Vector3 v2) {
return v1.x * v2.x + v1.y * v2.y + v1.z *v2.z;
}
Quaternion EulerToQuaternion(Euler angles) {
Quaternion result(std::cos(angles.roll / 2) * std::cos(angles.pitch / 2) * std::cos(angles.yaw / 2) + std::sin(angles.roll / 2) * std::sin(angles.pitch / 2) * std::sin(angles.yaw / 2),
std::sin(angles.roll / 2) * std::cos(angles.pitch / 2) * std::cos(angles.yaw / 2) - std::cos(angles.roll / 2) * std::sin(angles.pitch / 2) * std::sin(angles.yaw / 2),
std::cos(angles.roll / 2) * std::sin(angles.pitch / 2) * std::cos(angles.yaw / 2) + std::sin(angles.roll / 2) * std::cos(angles.pitch / 2) * std::sin(angles.yaw / 2),
std::cos(angles.roll / 2) * std::cos(angles.pitch / 2) * std::sin(angles.yaw / 2) - std::sin(angles.roll / 2) * std::sin(angles.pitch / 2) * std::cos(angles.yaw / 2));
return result;
}
Euler QuaternionToEuler(Quaternion angles) {
float sinr_cosangles = 2 * (angles.w * angles.i + angles.j * angles.k);
float cosr_cosangles = 1 - 2 * (angles.i * angles.i + angles.j * angles.j);
float sinangles = std::sqrt(1 + 2 * (angles.w * angles.j - angles.i * angles.k));
float cosangles = std::sqrt(1 - 2 * (angles.w * angles.j - angles.i * angles.k));
float siny_cosangles = 2 * (angles.w * angles.k + angles.i * angles.j);
float cosy_cosangles = 1 - 2 * (angles.j * angles.j + angles.k * angles.k);
Euler result(std::atan2(siny_cosangles, cosy_cosangles),
2 * std::atan2(sinangles, cosangles) - PI / 2,
std::atan2(sinr_cosangles, cosr_cosangles));
return result;
}
void Vector3ToQuaternion(Vector3 vector, Quaternion &out) {
out.w = 0;
out.i = vector.x;
out.j = vector.y;
out.k = vector.z;
}
void QuaternionToVector3(Quaternion quat, Vector3 &out) {
out.x = quat.i;
out.y = quat.j;
out.z = quat.k;
}
void QuaternionMultiplication(Quaternion Q1, Quaternion Q2, Quaternion &out) {
out.w = Q1.w * Q2.w - Q1.i * Q2.i - Q1.j * Q2.j - Q1.k * Q2.k;
out.i = Q1.w * Q2.i + Q1.i * Q2.w + Q1.j * Q2.k - Q1.k * Q2.j;
out.j = Q1.w * Q2.j - Q1.i * Q2.k + Q1.j * Q2.w + Q1.k * Q2.i;
out.k = Q1.w * Q2.k + Q1.i * Q2.j - Q1.j * Q2.i + Q1.k * Q2.w;
}
void InverseQuaternion(Quaternion Q, Quaternion &out) {
out.w = Q.w;
out.i = -Q.i;
out.j = -Q.w;
out.k = -Q.k;
}
void QuaternionToRotationMatrix(Quaternion Q, RotationMatrix &out) {
float v01 = Q.w * Q.i;
float v02 = Q.w * Q.j;
float v03 = Q.w * Q.k;
float v12 = Q.i * Q.j;
float v13 = Q.i * Q.k;
float v23 = Q.j * Q.k;
float q02 = Q.w * Q.w;
out.x1 = 2 * (q02 + Q.i * Q.i) - 1;
out.y2 = 2 * (q02 + Q.j * Q.j) - 1;
out.z3 = 2 * (q02 + Q.k * Q.k) - 1;
out.y1 = 2 * (v12 - v03);
out.z1 = 2 * (v13 + v02);
out.x2 = 2 * (v12 + v03);
out.z2 = 2 * (v23 - v01);
out.x3 = 2 * (v13 - v02);
out.y3 = 2 * (v23 + v01);
}
}
Maths\AMaths.cuh
#ifndef AMATHS_H
#define AMATHS_H
#include <cmath>
#include <iostream>
namespace AMaths {
#define PI 3.14159265359
}
namespace Literals {
constexpr long double operator"" _mm(long double x) { return x / 1000; };
constexpr long double operator"" _cm(long double x) { return x / 100; };
constexpr long double operator"" _m(long double x) { return x; };
constexpr long double operator"" _deg(long double x) { return x * (PI / 180); };
constexpr long double operator"" _rad(long double x) { return x; };
}
namespace AMaths {
#define HALFPI 1.57079632679
#define QUATERPI 0.785398163397
#define ONEOVERPI 0.318309886184
#define A 0.0776509570923569
#define B -0.287434475393028
#define C (QUATERPI - A - B)
#define S1 0.166666666667
#define S2 0.00833333333333
#define S3 0.000198412698413
struct Vector2 {
float x, y;
};
struct Vector3 {
float x, y, z;
Vector3() {
this->x = 0;
this->y = 0;
this->z = 0;
}
Vector3(float x, float y, float z) {
this->x = x;
this->y = y;
this->z = z;
}
};
struct Euler {
float yaw, pitch, roll;
Euler() {
this->yaw = 0;
this->pitch = 0;
this->roll = 0;
}
Euler(float yaw, float pitch, float roll) {
this->yaw = yaw;
this->pitch = pitch;
this->roll = roll;
}
};
struct Quaternion {
float w, i, j, k;
Quaternion() {
this->w = 0;
this->i = 0;
this->j = 0;
this->k = 0;
}
Quaternion(float w, float i, float j, float k) {
this->w = w;
this->i = i;
this->j = j;
this->k = k;
}
};
struct RotationMatrix {
float x1, y1, z1;
float x2, y2, z2;
float x3, y3, z3;
};
void VecAdd(float* vec1, float* vec2, int vecsize, float* out);
float V2Dot(Vector2 v1, Vector2 v2);
float V3Dot(Vector3 v1, Vector3 v2);
Quaternion EulerToQuaternion(Euler);
Euler QuaternionToEuler(Quaternion);
void Vector3ToQuaternion(Vector3 vector, Quaternion &out);
void QuaternionToVector3(Quaternion quat, Vector3 &out);
void QuaternionMultiplication(Quaternion Q1, Quaternion Q2, Quaternion &out);
void InverseQuaternion(Quaternion Q, Quaternion &out);
void QuaternionToRotationMatrix(Quaternion Q, RotationMatrix &out);
}
#endif
Maths\GMaths.cu
#include "GMaths.cuh"
namespace GMaths {
__device__ void Add(float val1, float val2, float &out) {
out = val1 + val2;
}
__global__ void VecAdd(float* vec1, float* vec2, float* out) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
out[i] = vec1[i] + vec2[i];
}
__global__ void MatMulVec4(float* matrix[16], float* vector[4], float* out[4]) {
int temp = threadIdx.x + blockDim.x * blockIdx.x;
int i = temp % 4;
int s = (int)(temp / (4));
out[s][i] = matrix[s][i] * vector[s][0] + matrix[s][i+4] * vector[s][1] + matrix[s][i+8] * vector[s][2] + matrix[s][i+12] * vector[s][3];
}
}
Maths\GMaths.cuh
#ifndef GMATHS_CUH
#define GMATHS_CUH
namespace GMaths {
__device__ void Add(float val1, float val2, float &out);
__global__ void VecAdd(float* vec1, float* vec2, float* out);
__global__ void MatMulVec4(float* matrix[16], float* vector[4], float* out[4]);
}
#endif
Maths\CMakeLists.txt
add_library(AMaths AMaths.cuh AMaths.cu)
add_library(GMaths GMaths.cuh GMaths.cu)
target_compile_features(AMaths PUBLIC cxx_std_11)
target_compile_features(GMaths PUBLIC cxx_std_11)
set_target_properties(AMaths PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(GMaths PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
Edit 3:
Specific error I am getting currently:
Main.obj : error LNK2019: unresolved external symbol __cudaRegisterLinkedBinary_5bf867cd_7_Main_cu_cd620806 referenced in f
unction "void __cdecl __sti____cudaRegisterAll(void)" (?__sti____cudaRegisterAll@@YAXXZ) [C:\Users\{myname}\source\repos\Render
ingengine\build\Renderingengine\Main.vcxproj]
Edit 4:
After taking paleonix's suggestion and removing the flags and adding the CUDA_SEPERABLE_COMPILATION
to on for main and it works
New CMakeLists.txt:
cmake_minimum_required (VERSION 3.8)
project (Engine LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
include_directories(Maths)
add_executable(Main Main.cu Main.cuh)
set_target_properties(Main PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_subdirectory(Maths)
target_link_libraries(Main
PRIVATE
GMaths
AMaths)
Putting all kinds of flags into CMAKE_CUDA_FLAGS
like
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true -lcudadevrt -lcublas_device -dc")
is a bad idea because even if it works, it isn't very portable (e.g. it wont work if you want to use clang as CUDA compiler). In this particular case you have also mixed compiler flags and linker flags which might be the actual reason for the linker error.
-rdc=true
and -dc
, just use the CUDA_SEPARABLE_COMPILATION
property on the Main
target just as you have done for the two libraries.-lcudadevrt
is automatically added by CMake without any need for your input.-lcublas_device
is a bit more tricky as the CUDAToolkit
package which provides targets for cuBLAS and other CUDA libraries doesn't seem to provide a target for the device side cuBLAS library. Ideally one would create a proper target for that, but in your context it might be enough to have the plain name cublas_device
in the target_link_libraries()
command.Further remarks:
I don't think cxx_std_11
will work on CUDA
targets, you should use cuda_std_11
which became available with CMake 3.17, i.e. cmake_minimum_required()
should be modified correspondingly.
The arguments to cudaMemcpy()
are in the wrong order and the &
is wrong here. It should be
cudaMemcpy(vec1, temp, sizeof(float) * 3, cudaMemcpyHostToDevice);
These are most probably just due to keeping the MRE short and not real oversights on the author's side, but for completeness:
cudaDeviceSynchronize()
and do something (print) with the results for the MRE to make sense.cudaFree()
and delete[]
.*: I looked at CMakeFiles/Main.dir/linkLibs.rsp
in the build directory to see what is being linked.