I'm trying to compile a small library containing CUDA code.
I have successfully compiled it as a shared lib, but what I actually need is a static lib.
I have two source files:
main.c: containing a test function written in C. I compile this file with gcc
main_kernel.cu: containing a CUDA-kernel 'testKernel' and a C-wrapper function 'test_gpu' that calls into testKernel.
Here is an excerpt of main_kernel.cu:
// Adds one to each of the first `l` elements of `data`.
// Every thread handles at most one element, selected by its global index
// in the 1-D launch grid.
__global__ void testKernel(float *data, const int l)
{
    // Global thread index across the whole 1-D grid.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall past the end of the array have nothing to do.
    if (i >= l)
        return;

    data[i] += 1.0f;
}
#ifdef __cplusplus
extern "C" {
#endif
/*
 * C-callable wrapper that launches testKernel so that every one of the
 * `length` elements of `data` is incremented by one.
 *
 * data   - array the kernel reads and writes on the device
 *          (presumably a device pointer - confirm against the caller).
 * length - number of elements in `data`; values <= 0 are a no-op.
 *
 * The launch is asynchronous and no error status is returned.
 */
void test_gpu(float *data, const int length)
{
    // A grid with zero blocks is an invalid launch configuration, and
    // there is nothing to process anyway.
    if (length <= 0)
        return;

    const int threadsPerBlock = 1024;
    // Round up so the last, partially filled block is included. Written as
    // (length - 1) / n + 1 to avoid overflowing int for very large lengths.
    const int blocks = (length - 1) / threadsPerBlock + 1;

    // Run kernel: one thread per element. The kernel itself masks off the
    // surplus threads of the final block.
    testKernel<<< blocks, threadsPerBlock >>>(data, length);
}
#ifdef __cplusplus
}
#endif
I use gcc to compile main.c into main.o <- This works as desired.
I compile main_kernel.cu with nvcc using the -rdc=true option into an intermediate object I call main_kernel_h.o.
I then use nvcc with the -dlink option to device-link the intermediate object into main_kernel.o.
Then, as per this answer, I link all three objects into a static lib with ar using the rcs flags.
This all runs fine, but the problem arises when I want to link an executable to the new library.
I then get undefined references for a bunch of CUDA-functions. Here is the exact error:
../../build/test/bin/libLib.a(main_kernel_h.o): In function `__nv_cudaEntityRegisterCallback(void**)':
tmpxft_0000422a_00000000-5_main_kernel.compute_52.cudafe1.cpp:(.text+0x60): undefined reference to `__cudaRegisterFunction'
../../build/test/bin/libLib.a(main_kernel_h.o): In function `__device_stub__Z10testKernelPfi(float*, int)':
tmpxft_0000422a_00000000-5_main_kernel.compute_52.cudafe1.cpp:(.text+0x8a): undefined reference to `cudaSetupArgument'
tmpxft_0000422a_00000000-5_main_kernel.compute_52.cudafe1.cpp:(.text+0xb0): undefined reference to `cudaSetupArgument'
tmpxft_0000422a_00000000-5_main_kernel.compute_52.cudafe1.cpp:(.text+0xc7): undefined reference to `cudaLaunch'
../../build/test/bin/libLib.a(main_kernel_h.o): In function `test_gpu':
tmpxft_0000422a_00000000-5_main_kernel.compute_52.cudafe1.cpp:(.text+0x124): undefined reference to `cudaConfigureCall'
../../build/test/bin/libLib.a(main_kernel.o): In function `__cudaUnregisterBinaryUtil':
link.stub:(.text+0xf): undefined reference to `__cudaUnregisterFatBinary'
../../build/test/bin/libLib.a(main_kernel.o): In function `__cudaRegisterLinkedBinary(__fatBinC_Wrapper_t const*, void (*)(void**), void*)':
link.stub:(.text+0xd0): undefined reference to `__cudaRegisterFatBinary'
The output I get from nm is this:
main.o:
U _GLOBAL_OFFSET_TABLE_
0000000000000000 r .LC0
0000000000000000 T testFunc
U test_gpu
main_kernel.o:
U atexit
U __cudaRegisterFatBinary
0000000000000015 T __cudaRegisterLinkedBinary_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8
0000000000000000 t __cudaUnregisterBinaryUtil
U __cudaUnregisterFatBinary
0000000000000000 r fatbinData
U __fatbinwrap_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8
0000000000000000 r _ZL15__fatDeviceText
0000000000000000 b _ZL20__cudaFatCubinHandle
0000000000000010 b _ZL22__cudaPrelinkedFatbins
000000000000005b t _ZL26__cudaRegisterLinkedBinaryPK19__fatBinC_Wrapper_tPFvPPvES2_
0000000000000000 r _ZL87def_module_id_str_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8
0000000000000020 b _ZZ96__cudaRegisterLinkedBinary_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8E3__p
0000000000000030 b _ZZL26__cudaRegisterLinkedBinaryPK19__fatBinC_Wrapper_tPFvPPvES2_E16__callback_array
0000000000000028 b _ZZL26__cudaRegisterLinkedBinaryPK19__fatBinC_Wrapper_tPFvPPvES2_E3__i
main_kernel_h.o:
U cudaConfigureCall
U cudaLaunch
U __cudaRegisterFunction
U __cudaRegisterLinkedBinary_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8
U cudaSetupArgument
0000000000000000 r fatbinData
0000000000000000 D __fatbinwrap_57_tmpxft_0000422a_00000000_9_main_kernel_compute_52_cpp1_ii_335679f8
U _GLOBAL_OFFSET_TABLE_
0000000000000000 r .LC0
00000000000000e0 T test_gpu
00000000000000d0 T _Z10testKernelPfi
0000000000000070 T _Z31__device_stub__Z10testKernelPfiPfi
0000000000000000 r _ZL15__module_id_str
0000000000000000 t _ZL22____nv_dummy_param_refPv
0000000000000000 t _ZL24__sti____cudaRegisterAllv
0000000000000010 t _ZL31__nv_cudaEntityRegisterCallbackPPv
0000000000000030 b _ZL32__nv_fatbinhandle_for_managed_rt
0000000000000020 b _ZZ31__device_stub__Z10testKernelPfiPfiE3__f
0000000000000010 b _ZZL22____nv_dummy_param_refPvE5__ref
0000000000000000 b _ZZL31__nv_cudaEntityRegisterCallbackPPvE5__ref
In case you want my exact commands, I have also included the makefiles I use to build the objects.
Makefile for the library:
# Makefile for the static library libLib.a (C host code + CUDA code).
#
# Build pipeline:
#   main.c          --$(CC) -c-->             $(OBJDIR)main.o
#   main_kernel.cu  --$(NVCC) -c -rdc=true--> $(OBJDIR)main_kernel_h.o
#   main_kernel_h.o --$(NVCC) -dlink-->       $(OBJDIR)main_kernel.o
#   all three       --$(AR) rcs-->            $(OUTDIR)libLib.a

# GPU architectures to generate code for.
ARCH = -gencode arch=compute_30,code=sm_30 \
       -gencode arch=compute_35,code=sm_35 \
       -gencode arch=compute_50,code=[sm_50,compute_50] \
       -gencode arch=compute_52,code=[sm_52,compute_52]

VPATH=.
SLIB=libLib.so
ALIB=libLib.a
# Both directory variables end in '/', so they are concatenated directly
# with file names below.
OBJDIR=../../build/test/bin-int/lib/
OUTDIR=../../build/test/bin/

# Base C-stuff
CC=gcc
CPP=g++
NVCC=nvcc
AR=ar
ARFLAGS=rcs
OPTS=-Ofast
# NOTE: LDFLAGS is not used by any rule in this Makefile. A static archive
# is not linked, so whoever links against libLib.a has to pass these
# libraries on their own link line.
LDFLAGS= -lm -pthread -lc
COMMON= -DEXT_SO
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors
# OPTS=-O0 -g # <- Debug
CFLAGS+=$(OPTS)

# CUDA
COMMON+= -I/usr/local/cuda/include/
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand

# CUDNN
LDFLAGS+= -lcudnn

# C-objects
OBJ=main.o

# CUDA-objects
# LDFLAGS+= -lstdc++ # <- Unsure if this is required
# OBJ_CUDA is the device-link output, CUDA_HOST the object compiled from the .cu file.
OBJ_CUDA=main_kernel.o
CUDA_HOST=main_kernel_h.o

OBJS = $(addprefix $(OBJDIR), $(OBJ))
OBJS_CUDA = $(addprefix $(OBJDIR), $(OBJ_CUDA))
OBJS_HOST = $(addprefix $(OBJDIR), $(CUDA_HOST))
# Any header or Makefile change rebuilds every object.
DEPS = $(wildcard ./*.h) Makefile

# None of these names a file that a recipe creates under that exact name.
.PHONY: all obj clean $(ALIB)

# Build all steps
all: $(OBJS) $(OBJS_HOST) $(OBJS_CUDA) $(OUTDIR)$(ALIB)

# Alias so that `make libLib.a` keeps working as before.
$(ALIB): $(OUTDIR)$(ALIB)

# The output directories must exist before anything is written into them.
# Order-only (after the '|') so that they never force a rebuild, and so that
# the ordering also holds under `make -j`.
$(OBJS) $(OBJS_HOST) $(OBJS_CUDA) $(OUTDIR)$(ALIB): | obj

# Link static lib
# The target is the real output path, so make can tell when the archive is
# already up to date instead of re-running ar on every invocation.
$(OUTDIR)$(ALIB): $(OBJS_CUDA) $(OBJS_HOST) $(OBJS)
	$(AR) $(ARFLAGS) $@ $^

# Compile c
$(OBJDIR)%.o: %.c $(DEPS)
	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@

# Compile cuda-hostcode
# -rdc=true generates relocatable device code, which is what makes the
# separate device-link step below necessary.
$(OBJDIR)%_h.o: %.cu $(DEPS)
	$(NVCC) $(ARCH) -c -rdc=true --compiler-options "$(CFLAGS)" $< -o $@

# Device Link device code
$(OBJDIR)%.o: $(OBJDIR)%_h.o $(DEPS)
	$(NVCC) $(ARCH) -dlink -o $@ $< -lcuda -lcudart -lcublas -lcurand -lcudnn

# Create the output directories.
obj:
	mkdir -p $(OBJDIR) $(OUTDIR)

# Remove the objects and the archive where it was actually written.
# $(OBJDIR) already ends in '/'; not adding another one also avoids
# expanding to '/*' should OBJDIR ever be empty.
clean:
	rm -rf $(OBJS) $(OUTDIR)$(ALIB) $(OBJDIR)*
And for the executable trying to link the static library:
# Makefile for the test executable that links against the static library libLib.a.
VPATH=.
EXEC=Test
# Both directory variables end in '/', so they are concatenated directly with file names below.
OBJDIR=../../build/test/bin-int/test/
OUTDIR=../../build/test/bin/
# Path of the static library to link against.
LIB=$(OUTDIR)libLib.a
# Base C-stuff
# CPP is defined but not used by any rule visible here.
CC=gcc
CPP=g++
OPTS=-Ofast
# CUDA libraries needed at link time. '$$ORIGIN' reaches the linker as $ORIGIN,
# i.e. an rpath relative to the executable's own directory; -s strips symbols.
LDFLAGS= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand -lcudnn -Wl,-rpath,'$$ORIGIN' -s
# -MMD -MP make the compiler write .d dependency files; no include of those
# files is visible in this Makefile.
CFLAGS= -MMD -MP -DNDEBUG -DSTRIP_PYTHON -I../src -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors
# OPTS=-O0 -g # <- Debug
CFLAGS+=$(OPTS)
# C-objects
OBJ=test.o
OBJS = $(addprefix $(OBJDIR), $(OBJ))
# Any header or Makefile change rebuilds every object.
DEPS = $(wildcard ./*.h) Makefile
# Build all steps
all: obj $(EXEC)
# Link executable
# NOTE: $(LIB) is placed AFTER $(LDFLAGS) here, so the static archive appears
# on the link line after the CUDA libraries whose symbols it needs. This is
# the ordering that produces the undefined references to the CUDA runtime
# symbols; placing $(LIB) before $(LDFLAGS) resolves them.
# NOTE(review): $(LIB) is not a prerequisite of this rule, and the output is
# written to $(OUTDIR)$@ rather than to the target name - presumably this rule
# re-runs on every invocation; confirm.
$(EXEC): $(OBJS)
$(CC) $^ -o $(OUTDIR)$@ $(LDFLAGS) $(LIB)
# Compile c
$(OBJDIR)%.o: %.c $(DEPS)
$(CC) $(CFLAGS) -c $< -o $@
# Create the object directory ($(OUTDIR) is not created by this Makefile).
obj:
mkdir -p $(OBJDIR)
.PHONY: clean
# Removes the objects only; the linked executable in $(OUTDIR) is left in place.
clean:
rm -rf $(OBJS) $(OBJDIR)/*
I hope you can help me find my mistake(s).
Turns out the problem was not in compilation of the static lib, but in the linking of said library.
The problem was fixed for me by changing:
# Link executable (failing order: the static lib comes after the CUDA libs in $(LDFLAGS))
$(EXEC): $(OBJS)
$(CC) $^ -o $(OUTDIR)$@ $(LDFLAGS) $(LIB)
Into:
# Link executable (working order: the static lib comes before the CUDA libs in $(LDFLAGS))
$(EXEC): $(OBJS)
$(CC) $^ -o $(OUTDIR)$@ $(LIB) $(LDFLAGS)
Such that the static lib, which is just a collection of objects, is linked with the other objects, before linking to the CUDA libs in $(LDFLAGS).
A note for anyone else stumbling on this in the future: whether the original ordering actually causes an error seems to depend somewhat on the version of your compiler.