Vectorize/optimize a function in R

Suppose I have the following function:

myfunc <- function(n){
  
  alln <- 1:n
  
  part1 <- qbeta(0.02,alln,n-alln+1)+qbeta(0.03,1,n-alln+1)
  
  return(sum(dbinom(alln,n,0.5)*part1))
}

How to optimize the function to support vectorized n without using Rcpp?

Here are my attempts/benchmarks for n=1:1000

vec_myfunc <- Vectorize(myfunc)
bench::mark(vec = vec_myfunc(1:1000), sapply = sapply(1:1000, myfunc), check = FALSE, min_iterations = 10)

  expression     min median `itr/sec` mem_alloc `gc/sec` n_itr
  <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl> <int>
1 vec          2.43s   2.5s     0.402    34.9MB    0.938     3
2 sapply       2.21s  2.32s     0.430    34.9MB    1.72      2

Solution

It can be vectorized using sequence, cumsum, and diff. However, most of the time is spent in qbeta, so vectorization by itself will be hardly noticeable.

myfunc2 <- function(n){
  alln <- sequence(n)
  allnr <- sequence(n, n, -1)
  
  part1 <- qbeta(0.02, alln, allnr) + qbeta(0.03, 1, allnr)
  
  diff(c(0, cumsum(dbinom(alln, rep.int(n, n), 0.5)*part1)[cumsum(n)]))
}

microbenchmark::microbenchmark(
  myfunc = myfunc(1:1000),
  myfunc2 = myfunc2(1:1000),
  times = 10,
  check = "equal"
)
#> Unit: seconds
#>     expr      min       lq     mean   median       uq      max neval
#>   myfunc 2.443684 2.498148 2.549963 2.563231 2.616979 2.657841    10
#>  myfunc2 2.404791 2.448580 2.507485 2.523236 2.554989 2.580222    10

A bigger time-saver will come from pre-computing qbeta(0.03, 1, n - alln + 1). All the other qbeta calls have unique inputs, so those will still take the bulk of the time. If duplicates in n are possible, we can also check for those. This would be my version of a fully optimized function without finding a faster version of qbeta, which is unlikely without much effort:

myfunc3 <- function(n){
  un <- unique(n)
  alln <- sequence(un)
  allnr <- sequence(un, un, -1)
  qbeta3 <- 1 - (1 - 0.03)^(1/(1:max(n)))
  
  part1 <- qbeta(0.02, alln, allnr) + qbeta3[allnr]
  
  if (length(n) != length(un)) {
    diff(c(0, cumsum(dbinom(alln, rep.int(un, un), 0.5)*part1)[cumsum(un)]))[match(n, un)]
  } else {
    diff(c(0, cumsum(dbinom(alln, rep.int(un, un), 0.5)*part1)[cumsum(un)]))
  }
}

microbenchmark::microbenchmark(
  myfunc2 = myfunc2(1:1000),
  myfunc3 = myfunc3(1:1000),
  times = 10,
  check = "equal"
)
#> Unit: seconds
#>     expr      min       lq     mean   median       uq      max neval
#>  myfunc2 2.412228 2.479897 2.505713 2.508293 2.537983 2.568844    10
#>  myfunc3 1.279183 1.334574 1.364390 1.364373 1.399121 1.440190    10