Attempting to vectorize R function for Improved Speed

I've been chipping away at some code to improve its speed. The best I've been able to do so far is as shown in the reproducible example below, but I am still using two loops. In particular, the final line with the two sapply() is functionally what I want, but not computationally the most efficient here in R.

Tossing this out to the community to learn if anyone sees a computational improvement and still returns the object poly.pr in the form returned by the code now.

This code is seemingly fast enough when run only once. However, in larger problems it increases as the dimensions X2 and nds increase. Furthermore, this code is executed many times as it lives inside an optimization procedure. So, finding ways to eek out the fastest time here will be really helpful.

GRM <- function(theta, d, score, a, D = 1.7, machineValue = sqrt(.Machine$double.eps)){
    maxD <- length(d)
    pos1 <- which(score == 0)
    pos2 <- which(score == maxD)
    pos3 <- which(score > 0 & score < maxD)

    pr.1 <- 1/(1 + exp(D*a*(theta - d[(score[pos1]+1)])))
    pr.2 <- 1/(1 + exp(-D*a*(theta - d[score[pos2]])))
    pr.3 <- 1/(1 + exp(D*a*(theta - d[(score[pos3]+1)]))) - 1/(1 + exp(D*a*(theta - d[score[pos3]])))

    ### Now put them back together in order
    pr <- numeric(length(score))
    pr[pos1] <- pr.1
    pr[pos2] <- pr.2
    pr[pos3] <- pr.3
    ind1 <- which(pr > 1 - machineValue)
    ind2 <- which(pr < machineValue)
    if (any(ind1)) 
        pr[ind1] <- 1 - machineValue
    if (any(ind2))  
        pr[ind2] <- machineValue
    pr
}

X2 <- cbind(sample(0:2,3000,replace=T), sample(0:3,3000,replace=T))
I <- ncol(X2)
d.params <- list(c(-1.84, 0.22), c(-1.53, -0.43, 2.55))
a.params <- c(1,1.5)
nds <- c(-3,0,3)

### Working to improve speed here by removing loop if possible
poly.pr <- exp(sapply(1:length(nds), function(j) rowSums(log(sapply(1:I, function(i) GRM(nds[j],d.params[[i]],X2[,i],a.params[i],1))))))

Solution

Here are a few options. The first is to simply speed up the current version of GRM by frontloading the calculations involving log and exp (and pmin and pmax instead of comparisons) then using indexing:

GRM2 <- function(theta, d, score, a, D = 1.7, machineValue = sqrt(.Machine$double.eps)) {
  pr <- 1/(1 + exp(D*a*(theta - d)))
  pr <- log(
    pmax(
      pmin(
        c(
          pr[1],
          diff(pr),
          1/(1 + exp(-D*a*(theta - d[length(d)])))
        ),
        1 - machineValue
      ),
      machineValue
    )
  )
  pr[score + 1L]
}

The second option builds on the first, but uses outer to avoid the loop over nds:

GRM3 <- function(theta, d, score, a, D = 1.7, machineValue = sqrt(.Machine$double.eps)) {
  pow <- D*a*outer(d, theta, "-")
  pr <- 1/(1 + exp(-pow))
  pr <- log(
    pmax(
      pmin(
        rbind(
          pr[1,],
          diff(pr),
          1/(1 + exp(pow[length(d),]))
        ),
        1 - machineValue
      ),
      machineValue
    )
  )
  c(pr[score + 1L,])
}

The final option is the same as the second, but uses Rfast::rowprods instead of exp(rowSums(log(:

library(Rfast) # for `rowprods`

GRM4 <- function(theta, d, score, a, D = 1.7, machineValue = sqrt(.Machine$double.eps)) {
  pow <- D*a*outer(d, theta, "-")
  pr <- 1/(1 + exp(-pow))
  pr <- pmax(
    pmin(
      rbind(
        pr[1,],
        diff(pr),
        1/(1 + exp(pow[length(d),]))
      ),
      1 - machineValue
    ),
    machineValue
  )
  c(pr[score + 1L,])
}

Benchmarking:

microbenchmark::microbenchmark(
  poly.pr = exp(sapply(1:length(nds), function(j) rowSums(log(sapply(1:I, function(i) GRM(nds[j], d.params[[i]], X2[,i], a.params[i], 1)))))),
  poly.pr2 = exp(sapply(1:length(nds), function(j) rowSums(sapply(1:I, function(i) GRM2(nds[j], d.params[[i]], X2[,i], a.params[i], 1))))),
  poly.pr3 = matrix(exp(rowSums(sapply(1:I, function(i) GRM3(nds, d.params[[i]], X2[,i], a.params[i], 1)))), ncol = length(nds)),
  poly.pr4 = matrix(rowprods(sapply(1:I, function(i) GRM4(nds, d.params[[i]], X2[,i], a.params[i], 1))), ncol = length(nds)),
  unit = "relative",
  check = "equal"
)
#> Unit: relative
#>      expr       min       lq     mean   median       uq      max neval
#>   poly.pr 10.169781 7.387566 6.543169 7.210486 6.794433 1.400273   100
#>  poly.pr2  2.890739 2.132149 2.024770 2.074921 2.031143 1.036986   100
#>  poly.pr3  2.266856 1.856143 1.813469 1.823121 1.747661 1.185563   100
#>  poly.pr4  1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100

I suspect the gains will be even more pronounced as the dimensions of nds and X2 increase.