Search code examples
rloopsmatrixvectorizationcorrelation

R: transform loops into vectorized execution for correlation between rows


I would like conditionally select values from dt2 based on values in dt1 in a row-wise manner and then pair-wise correlate rows in dt2 and save the correlation values in a new matrix, dt3. Before I start to explain in words, I guess the R code is much more descriptive. I do this by looping over the data frame, which is quite slow. I am sure there is the possibility to do this in a vectorized manner to increase performance. Has anyone a solution or suggestion? Thank you a lot!

library(data.table)

dt1 <- data.table(a=round(runif(100)), b=round(runif(100)), c=round(runif(100)), d=round(runif(100)), e=round(runif(100)), f=round(runif(100)))
dt2 <- data.table(a=runif(100), b=runif(100), c=runif(100), d=runif(100), e=runif(100), f=runif(100))

m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow=m, ncol=n)

col_vec <- 1:n
for (r in 1:m) {
  for (p in col_vec) {
    selection <- dt1[r,] > 0 & dt1[p,] > 0 
    selection <- as.vector(selection)
    r_values <- as.numeric(dt2[p, ..selection])
    p_values <- as.numeric(dt2[r, ..selection])
    correlation_value <- cor(r_values, p_values, method='spearman', use='na.or.complete')
    dt3[r,p] <- correlation_value
    dt3[p,r] <- correlation_value
    
    print(glue('row {r} vs row {p}'))
  }
  col_vec <- col_vec[-1]
}

Solution

  • You could use the built-in NA exclusion mechanism with use = "pairwise.complete.obs".

    Set values in dt2 as missing if the corresponding dt1 value is 0, and then use one cor() call.

    library(data.table)
    
    n <- 4
    set.seed(42)
    
    dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
    dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
    
    replace(t(dt2), t(dt1) == 0, NA) |>
      cor(method = "spearman", use = "pairwise.complete.obs")
    #>      [,1] [,2] [,3] [,4]
    #> [1,]  1.0    1   -1  0.1
    #> [2,]  1.0    1   NA -1.0
    #> [3,] -1.0   NA    1   NA
    #> [4,]  0.1   -1   NA  1.0
    

    Both approaches in functions for benchmarking:

    f_loop <- function(dt1, dt2) {
      m <- nrow(dt2)
      n <- m
      dt3 <- matrix(nrow = m, ncol = n)
    
      col_vec <- 1:n
      for (r in 1:m) {
        for (p in col_vec) {
          selection <- dt1[r, ] > 0 & dt1[p, ] > 0
          selection <- as.vector(selection)
          
          r_values <- as.numeric(dt2[p, ..selection])
          p_values <- as.numeric(dt2[r, ..selection])
          
          correlation_value <- cor(r_values, p_values, method = "spearman", use = "na.or.complete")
          dt3[r, p] <- correlation_value
          dt3[p, r] <- correlation_value
    
          # print(glue::glue("row {r} vs row {p}"))
        }
        col_vec <- col_vec[-1]
      }
    
      dt3
    }
    
    f_repl <- function(dt1, dt2) {
      replace(t(dt2), t(dt1) == 0, NA) |>
        cor(method = "spearman", use = "pairwise.complete.obs")
    }
    

    And test with bigger data:

    n <- 100
    set.seed(42)
    
    dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
    dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
    
    # Check that we get the same result
    all.equal(f_loop(dt1, dt2), f_repl(dt1, dt2))
    #> [1] TRUE
    
    bench::system_time(f_loop(dt1, dt2))
    #> process    real 
    #>   5.86s   5.92s
    bench::system_time(f_repl(dt1, dt2))
    #> process    real 
    #>   188ms   198ms