Build R package with relocatable device code

I am writing an R package that uses Thrust both to handle memory allocations and to avoid having to write my own CUDA kernels.

In some instances, I call cuBLAS routines from device code rather from host code. This changes the compilation requirements. While the code compiles using the nvcc commands below, it may be desirable to explicitly call the host linker (g++). How can I modify the current build process to accomplish this?

The steps I am using are:

Compile output file (max.o) containing device relocatable code using -dc switch
Create a library (libmax.a) to link with
Compile output file containing wrapper functions (somePackage.o) using -c switch
Create shared library (somePackage.so) that links to libmax.a using -shared switch

Working example shown below:

iterator.h: This defines some types, including strideAccessor.

max.h: Declaration of function in max.cu

max.cu: Defines a function which finds index of the maximum element in each of n concatenated arrays of dimension d.

somePackage.cu: A wrapper handling the R/C++ interface

$ cat iterator.h
#ifndef ITER_H
#define ITER_H

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>

typedef thrust::device_vector<int> ivec_d;
typedef thrust::device_vector<double> fvec_d;
typedef thrust::device_vector<int>::iterator intIter;
typedef thrust::device_vector<double>::iterator realIter;
typedef thrust::host_vector<int> ivec_h;
typedef thrust::host_vector<double> fvec_h;

typedef thrust::counting_iterator<int> countIter;

//Used for generating rep( (1:len)*incr, times=infinity)
struct stride: public thrust::unary_function<int, int>{

  int incr;

  __host__ __device__ stride(int incr=1): incr(incr){}

  __host__ __device__ int operator()(int x){

    return x*incr;
  }
};

typedef thrust::transform_iterator<stride, countIter> strideIter;
typedef thrust::permutation_iterator<realIter, strideIter> strideAccessor;


#endif

$ cat max.h
#include "iterator.h"

void cublas_max(fvec_d &x, ivec_d &result, int n, int d);

$ cat max.cu
#include "iterator.h"
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <cublas_v2.h>
#include <iostream>

struct whichMax : thrust::unary_function<double, int>{
  int dim;

  __host__ __device__ whichMax(int dim): dim(dim){}

  __host__ __device__ int operator()(double &vec){

    cublasHandle_t handle;
    cublasCreate_v2(&handle);
    int incx=1, n = dim, result =0;
    double *vec_ptr = thrust::raw_pointer_cast(&vec);

    //find the first index of a maximal element
    cublasIdamax(handle, n, vec_ptr, incx, &result);
    cublasDestroy_v2(handle);
    return result;
  }
};

void cublas_max(fvec_d &x, ivec_d &result, int n, int d){

  stride f(d);
  strideIter siter = thrust::transform_iterator<stride, countIter>(thrust::make_counting_iterator<int>(0), f);
  strideAccessor stridex = thrust::permutation_iterator<realIter, strideIter>(x.begin(), siter);

  whichMax g(d);

  //find the index of maximum for each of n subvectors
  thrust::copy(result.begin(), result.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  thrust::transform(stridex, stridex + n, result.begin(),  g);
  thrust::copy(result.begin(), result.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}

$ cat somePackage.cu
#include "iterator.h"
#include "max.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
#include <iostream>

extern "C" SEXP Rcublas_max(SEXP x, SEXP n, SEXP dim){

  double *xptr = REAL(x);
  int N = INTEGER(n)[0], D = INTEGER(n)[0];

  fvec_d dx(xptr, xptr+N*D);
  ivec_d dresult(N);

  cublas_max(dx, dresult, N, D);

  ivec_h hresult(N);
  thrust::copy(dresult.begin(), dresult.end(), hresult.begin());

  SEXP indices = PROTECT(allocVector(INTSXP, N));

  for(int i=0; i<N; ++i)
    INTEGER(indices)[i] = hresult[i];

  UNPROTECT(1);
  return indices;
}

$ make
nvcc -dc -arch=sm_35 -Xcompiler -fPIC -lcublas_device -lcublas_device max.cu -o max.o
nvcc -lib -arch=sm_35 -Xcompiler -fPIC -lcublas_device -lcublas_device max.o -o libmax.a
nvcc -c -arch=sm_35 -Xcompiler -fPIC -lcublas_device somePackage.cu -lmax -I/home/emittman/src/R-3.3.1/builddir/include -I. -o somePackage.o
nvcc -shared -arch=sm_35 -Xcompiler -fPIC -lcublas_device somePackage.o -I/home/emittman/src/R-3.3.1/builddir/include -I. -L. -lcublas_device -lmax -o somePackage.so
ptxas info    : 'device-function-maxrregcount' is a BETA feature

Solution

I created a R package with Rcpp, calling some extern functions from a C++ shared library which then call the CUDA kernels to perform the computations required.

What you are trying to do here is to compile your CUDA code into static library and then link it to your R package (which itself will be compiled into a shared library). My approach is different from yours and I am providing descriptions to my method just to give you a different thought.

Here is an simplified example.

kernels.cu of the shared library containing CUDA code:

__global__
void my_cuda_kernel( ... ) {
    // ......
}

main.cu of the shared library containing CUDA code:

extern "C" {
    void do_cuda_work( ... ) {
        thrust::copy( ... );
        my_cuda_kernel <<< ... >>> ( ... );
    }
}

package.cpp in the R package:

extern void do_cuda_work( ... );

// [[Rcpp::export]]
void call_cuda_code( ... ) {
    do_cuda_work( ... );
}

To compile the CUDA code into a shared library, you need to use:

nvcc -arch=sm_35 -dc ... kernels.cu -o kernels.o
nvcc -arch=sm_35 -dc ... main.cu -o main.o
nvcc --shared -arch=sm_35 ... kernels.o main.o ... libMyCUDALibrary.so

Please note that for separate compilation to work you need to specify -arch=sm_35 for both compiler and linker and -dc for compiler. Once you successfully created the shared library, linking your R package to it is fairly straight forward. (However, you may need to create a Makevars file under the src folder of your R package to specify include and library paths, and maybe RPATH as well):

CXX_STD= CXX11
PKG_CPPFLAGS= -I../../../CPP/include
PKG_LIBS= -L../../../CPP/bin/Release -lMyCUDALibrary -Wl,-rpath=$$HOME/MyCUDALibrary/CPP/bin/Release `$(R_HOME)/bin/Rscript -e "Rcpp::LdFlags()"`