Search code examples
rperformancecomparison

R: Find all variables with non-dominated means and variances FAST


I have a set of many random variables, each with a mean and standard deviation. I want to find all variables that are or are not dominated, i.e., that have the highest mean for their standard deviations. In other words, if X1 has a mean of 10 and and an sd of 10, and X2 has a mean of 9 and an sd of 11, then X2 is dominated by X1. Essentially, I will want to discard it. This is a common problem in portfolio optimization.

Easy when speed is not a concern:


aredominated1 <- function( smean, svar ) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        for (j in 1:PN) {
            if (i==j) next
            kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
            if (kill[i]) break  ## no longer any need to check whether i is dominated by a subsequent j
        }
    }

    kill
}


aredominated2 <- function( smean, svar ) {
    PN <- length(smean)
    kill <- rep(FALSE, PN)

    for (i in 1:PN) {
        kill[i] <- any( (smean[i] < smean) & (svar[i] > svar) )
    }
    kill
}


####

T <- 100000
smean <- rnorm(T)
svar <- abs(rnorm(T))

print( system.time( { v1 <- aredominated1( smean, svar ) } ) )
print( system.time( { v2 <- aredominated2( smean, svar ) } ) )
            
stopifnot( all( v1 == v2 ) )


Roughly speaking, aredominated1 takes 1.5 secs, while aredominated2 (more R-like) takes 60 secs. Is there a much faster way to write such a function in R, or am I close to the native R limit here?


Solution

  • Maybe you can optimize the loop like below

    tic <- function(smean, svar) {
        k <- order(smean)
        v <- svar[k]
        l <- length(v)
        kill <- vector(length = l)
        for (i in 1:(l - 1)) {
            for (j in (i + 1):l) {
                if (v[i] > v[j]) {
                    kill[i] <- 1
                    break
                }
            }
        }
        kill[order(k)]
    }
    

    Benchmark

    ivo1 <- function(smean, svar) {
        PN <- length(smean)
        kill <- rep(FALSE, PN)
    
        for (i in 1:PN) {
            for (j in 1:PN) {
                if (i == j) next
                kill[i] <- if ((smean[i] < smean[j]) & (svar[i] > svar[j])) 1 else kill[i]
                if (kill[i]) break ## no longer any need to check whether i is dominated by a subsequent j
            }
        }
    
        kill
    }
    
    ivo2 <- function(smean, svar) {
        PN <- length(smean)
        kill <- rep(FALSE, PN)
    
        for (i in 1:PN) {
            kill[i] <- any((smean[i] < smean) & (svar[i] > svar))
        }
        +kill
    }
    
    tic <- function(smean, svar) {
        k <- order(smean)
        v <- svar[k]
        l <- length(v)
        kill <- vector(length = l)
        for (i in 1:(l - 1)) {
            for (j in (i + 1):l) {
                if (v[i] > v[j]) {
                    kill[i] <- 1
                    break
                }
            }
        }
        kill[order(k)]
    }
    
    sbaldur <- function(smean, svar) {
        dt <- data.table(smean, svar)[, row := .I]
        setorder(dt, -smean, svar)
        dt[, kill := as.numeric(svar > cummin(svar))]
        dt$kill[order(dt$row)]
    }
    
    
    
    microbenchmark(
        ivo1 = ivo1(smean, svar),
        ivo2 = ivo2(smean, svar),
        tic = tic(smean, svar),
        sbaldur = sbaldur(smean, svar),
        times = 10,
        check = "equal",
        unit = "relative"
    )
    

    shows

    Unit: relative
        expr        min         lq       mean     median         uq         max
        ivo1  22.371324  19.695451  17.558189  18.104484  18.385273  11.2895773
        ivo2 439.136523 390.277007 340.305299 355.019021 333.328700 220.5935180
         tic   1.562904   1.397621   1.255117   1.340289   1.262332   0.7891665
     sbaldur   1.000000   1.000000   1.000000   1.000000   1.000000   1.0000000
     neval
        10
        10
        10
        10