I have a set of many random variables, each with a mean and standard deviation. I want to find all variables that are or are not dominated, i.e., that have the highest mean for their standard deviations. In other words, if X1 has a mean of 10 and and an sd of 10, and X2 has a mean of 9 and an sd of 11, then X2 is dominated by X1. Essentially, I will want to discard it. This is a common problem in portfolio optimization.
Easy when speed is not a concern:
aredominated1 <- function( smean, svar ) {
PN <- length(smean)
kill <- rep(FALSE, PN)
for (i in 1:PN) {
for (j in 1:PN) {
if (i==j) next
kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
if (kill[i]) break ## no longer any need to check whether i is dominated by a subsequent j
}
}
kill
}
aredominated2 <- function( smean, svar ) {
PN <- length(smean)
kill <- rep(FALSE, PN)
for (i in 1:PN) {
kill[i] <- any( (smean[i] < smean) & (svar[i] > svar) )
}
kill
}
####
T <- 100000
smean <- rnorm(T)
svar <- abs(rnorm(T))
print( system.time( { v1 <- aredominated1( smean, svar ) } ) )
print( system.time( { v2 <- aredominated2( smean, svar ) } ) )
stopifnot( all( v1 == v2 ) )
Roughly speaking, aredominated1
takes 1.5 secs, while aredominated2
(more R-like) takes 60 secs. Is there a much faster way to write such a function in R, or am I close to the native R limit here?
Maybe you can optimize the loop like below
tic <- function(smean, svar) {
k <- order(smean)
v <- svar[k]
l <- length(v)
kill <- vector(length = l)
for (i in 1:(l - 1)) {
for (j in (i + 1):l) {
if (v[i] > v[j]) {
kill[i] <- 1
break
}
}
}
kill[order(k)]
}
ivo1 <- function(smean, svar) {
PN <- length(smean)
kill <- rep(FALSE, PN)
for (i in 1:PN) {
for (j in 1:PN) {
if (i == j) next
kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
if (kill[i]) break ## no longer any need to check whether i is dominated by a subsequent j
}
}
kill
}
ivo2 <- function(smean, svar) {
PN <- length(smean)
kill <- rep(FALSE, PN)
for (i in 1:PN) {
kill[i] <- any((smean[i] < smean) & (svar[i] > svar))
}
+kill
}
tic <- function(smean, svar) {
k <- order(smean)
v <- svar[k]
l <- length(v)
kill <- vector(length = l)
for (i in 1:(l - 1)) {
for (j in (i + 1):l) {
if (v[i] > v[j]) {
kill[i] <- 1
break
}
}
}
kill[order(k)]
}
sbaldur <- function(smean, svar) {
dt <- data.table(smean, svar)[, row := .I]
setorder(dt, -smean, svar)
dt[, kill := as.numeric(svar > cummin(svar))]
dt$kill[order(dt$row)]
}
microbenchmark(
ivo1 = ivo1(smean, svar),
ivo2 = ivo2(smean, svar),
tic = tic(smean, svar),
sbaldur = sbaldur(smean, svar),
times = 10,
check = "equal",
unit = "relative"
)
shows
Unit: relative
expr min lq mean median uq max
ivo1 22.371324 19.695451 17.558189 18.104484 18.385273 11.2895773
ivo2 439.136523 390.277007 340.305299 355.019021 333.328700 220.5935180
tic 1.562904 1.397621 1.255117 1.340289 1.262332 0.7891665
sbaldur 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000
neval
10
10
10
10