R: Find all variables with non-dominated means and variances FAST

I have a set of many random variables, each with a mean and standard deviation. I want to find all variables that are or are not dominated, i.e., that have the highest mean for their standard deviations. In other words, if X1 has a mean of 10 and and an sd of 10, and X2 has a mean of 9 and an sd of 11, then X2 is dominated by X1. Essentially, I will want to discard it. This is a common problem in portfolio optimization.

Easy when speed is not a concern:


aredominated1 <- function( smean, svar ) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        for (j in 1:PN) {
            if (i==j) next
            kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
            if (kill[i]) break  ## no longer any need to check whether i is dominated by a subsequent j
        }
    }

    kill
}


aredominated2 <- function( smean, svar ) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        kill[i] <- any( (smean[i] < smean) & (svar[i] > svar) )
    }
    kill
}


####

T <- 100000
smean <- rnorm(T)
svar <- abs(rnorm(T))

print( system.time( { v1 <- aredominated1( smean, svar ) } ) )
print( system.time( { v2 <- aredominated2( smean, svar ) } ) )
            
stopifnot( all( v1 == v2 ) )

Roughly speaking, aredominated1 takes 1.5 secs, while aredominated2 (more R-like) takes 60 secs. Is there a much faster way to write such a function in R, or am I close to the native R limit here?

Solution

Maybe you can optimize the loop like below

tic <- function(smean, svar) {
    k <- order(smean)
    v <- svar[k]
    l <- length(v)
    kill <- vector(length = l)
    for (i in 1:(l - 1)) {
        for (j in (i + 1):l) {
            if (v[i] > v[j]) {
                kill[i] <- 1
                break
            }
        }
    }
    kill[order(k)]
}

Benchmark

ivo1 <- function(smean, svar) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        for (j in 1:PN) {
            if (i == j) next
            kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
            if (kill[i]) break ## no longer any need to check whether i is dominated by a subsequent j
        }
    }

    kill
}

ivo2 <- function(smean, svar) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        kill[i] <- any((smean[i] < smean) & (svar[i] > svar))
    }
    +kill
}

tic <- function(smean, svar) {
    k <- order(smean)
    v <- svar[k]
    l <- length(v)
    kill <- vector(length = l)
    for (i in 1:(l - 1)) {
        for (j in (i + 1):l) {
            if (v[i] > v[j]) {
                kill[i] <- 1
                break
            }
        }
    }
    kill[order(k)]
}

sbaldur <- function(smean, svar) {
    dt <- data.table(smean, svar)[, row := .I]
    setorder(dt, -smean, svar)
    dt[, kill := as.numeric(svar > cummin(svar))]
    dt$kill[order(dt$row)]
}



microbenchmark(
    ivo1 = ivo1(smean, svar),
    ivo2 = ivo2(smean, svar),
    tic = tic(smean, svar),
    sbaldur = sbaldur(smean, svar),
    times = 10,
    check = "equal",
    unit = "relative"
)

shows

Unit: relative
    expr        min         lq       mean     median         uq         max
    ivo1  22.371324  19.695451  17.558189  18.104484  18.385273  11.2895773
    ivo2 439.136523 390.277007 340.305299 355.019021 333.328700 220.5935180
     tic   1.562904   1.397621   1.255117   1.340289   1.262332   0.7891665
 sbaldur   1.000000   1.000000   1.000000   1.000000   1.000000   1.0000000
 neval
    10
    10
    10
    10