Tags: r, performance, similarity, doparallel, parallel-foreach

Improve performance for computing Weighted Jaccard in a large matrix


R input: a matrix (measures x samples) of size 2291 x 265, where each entry [i, j] is a value between 0 and 1

Output: a symmetric similarity matrix of the weighted Jaccard computed between all pairs of samples
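
For reference, the weighted Jaccard similarity between two non-negative vectors x and y is sum(min(x_i, y_i)) / sum(max(x_i, y_i)). A minimal sketch for a single pair of samples (the helper name is only illustrative):

# Weighted Jaccard of two equal-length, non-negative vectors:
# sum of element-wise minima divided by sum of element-wise maxima
weighted_jaccard <- function(x, y) sum(pmin(x, y)) / sum(pmax(x, y))
# e.g. between the first two samples: weighted_jaccard(input_m[, 1], input_m[, 2])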

Problem: find the fastest way to produce the output. I found a reasonable approach using "doParallel" and "foreach", but it is still too slow. I couldn't find any package with a function that computes the weighted Jaccard, but maybe I missed it. Feel free to answer with whatever solution and method you prefer. Thanks to everyone who answers. This is my script so far:

rm(list=ls())

#Load libraries ----
require(doParallel)
require(foreach)
require(doSNOW)
require(doMPI)
#Imported data ----
dim(input_m) #2291 x 265

#Set clusters ----
no_cores <- 3
cl <- makeCluster(as.integer(no_cores))
registerDoParallel(cl)

#I build all the combinations of the pairs of samples ----
samples=seq_len(ncol(input_m))
combs<-as.matrix(expand.grid(samples,samples))
combs<-unique(t(parApply(cl=cl, combs, 1, sort)))

#Prepare the resulting matrix ----
res_m <- matrix(ncol = ncol(input_m), nrow = ncol(input_m))
rownames(res_m)=colnames(input_m)
colnames(res_m)=colnames(input_m)

#Compute Weighted Jaccard similarity btw all pairs of samples ----
sim_m=foreach(s = 1:nrow(combs), .combine=rbind, .noexport=c("pair","num","den"), .inorder=FALSE) %dopar% {
    pair=input_m[,c(combs[s,1],combs[s,2])]
    num=sum(apply(pair,1,min))
    den=sum(apply(pair,1,max))
    return(c(combs[s,1],combs[s,2],num/den))
}

#Fill the prepared matrix with the results in sim_m
for (k in 1:nrow(sim_m)){
    sim=sim_m[k,3]
    idx1=sim_m[k,1]
    idx2=sim_m[k,2]
    res_m[idx1,idx2]=sim
    res_m[idx2,idx1]=sim
}

#Stop clusters
stopCluster(cl)
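
As a side note (an alternative sketch, not part of the script above): the unique pair combinations can also be built directly with combn(), avoiding the expand.grid/parApply/unique step, and the row-wise apply() calls in the loop body could be replaced by the vectorised pmin()/pmax() used in the definition above.

#Unique unordered pairs: choose(n, 2) off-diagonal pairs plus the n self-pairs for the diagonal
n <- ncol(input_m)
combs <- rbind(t(combn(n, 2)), cbind(1:n, 1:n))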

Solution

  • Using your answer and @HenrikB's comments, I managed to write a faster approach:

    ## simulate data
    nr <- 2291; nc <- 265
    set.seed(420)
    input_m <- matrix(rnorm(nr * nc), nrow = nr, ncol = nc)
    input_m[1:5, 1:5]
    #             [,1]       [,2]        [,3]        [,4]        [,5]
    # [1,] -0.76774389  1.2623614  2.44166184 -1.86900934  1.61130129
    # [2,] -1.44513238 -0.5469383 -0.31919480 -0.03155421  0.09293325
    # [3,] -0.71767075 -0.2753542  2.28792301  0.41545393 -0.47370802
    # [4,]  0.06410398  1.4956864  0.06859527  2.19689076 -0.96428109
    # [5,] -1.85365878  0.1609678 -0.52191522 -0.79557319 -0.33021108
    
    jaccardLuke <- function(input_m) {
      res_m = outer(1:ncol(input_m), 1:ncol(input_m) ,
                    FUN = Vectorize(function(r,c) {
                      require(matrixStats)
                      sum(rowMins(input_m[,c(r,c)]))/sum(rowMaxs(input_m[,c(r,c)]))
                      })
                    )
      rownames(res_m) = colnames(input_m)
      colnames(res_m) = colnames(input_m)
      res_m
    }
    
    jaccardHenrikB <- function(input_m) {
      require(matrixStats)
      res_m = outer(1:ncol(input_m), 1:ncol(input_m) ,
                    FUN = Vectorize(function(r, r2) {
                      x <- rowRanges(input_m, cols = c(r, r2))
                      s <- colSums(x)
                      s[1] / s[2]
                    })
      )
      rownames(res_m) = colnames(input_m)
      colnames(res_m) = colnames(input_m)
      res_m
    }
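
    A likely reason the rowRanges() version is faster: matrixStats::rowRanges() returns the row minima and maxima of the two selected columns in a single pass, so one colSums() call yields both the numerator and the denominator of the weighted Jaccard. A minimal sketch of that step in isolation:

    library(matrixStats)
    x <- rowRanges(input_m, cols = c(1, 2)) # nrow(input_m) x 2 matrix: row min, row max
    s <- colSums(x)
    s[1] / s[2] # weighted Jaccard of samples 1 and 2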
    

    My function:

    jaccardMinem <- function(input_m) {
      require(data.table)
      require(matrixStats)
    
      samples <- 1:ncol(input_m)
      comb <- CJ(samples, samples)
      comb[, i := .I]
      comb <- melt(comb, 'i')
      setorder(comb, value)
      v2 <- paste0("V", 1:2)
      comb[, variable2 := v2 , keyby = i]
      comb2 <- dcast(comb, i ~ variable2, value.var = 'value')
      combUnique <- unique(comb2, by = c('V1', 'V2'))
    
      XX <- apply(combUnique[, -'i'], 1, function(x) {
        x2 <- rowRanges(input_m, cols = x)
        s <- colSums2(x2)
        s[1] / s[2]
      })
    
      set(combUnique, j = 'xx', value = XX)
      rez2 <- merge(comb2, combUnique[, -'i'], by = c('V1', 'V2'), all.x = T)
      setorder(rez2, i)
      rez2 <- array(rez2$xx, dim = rep(ncol(input_m), 2))
      rownames(rez2) <- colnames(input_m)
      colnames(rez2) <- colnames(input_m)
      rez2
    }
    

    Test if all equal:

    all.equal(jaccardLuke(input_m), jaccardHenrikB(input_m))
    # [1] TRUE
    all.equal(jaccardLuke(input_m), jaccardMinem(input_m))
    # [1] TRUE
    

    benchmarking:

    system.time(jaccardLuke(input_m)) # 6.05 sec
    system.time(jaccardHenrikB(input_m)) # 2.75 sec
    system.time(jaccardMinem(input_m)) # 1.74 sec
    
    ## for larger data:
    nr <- 5000; nc <- 500
    set.seed(420)
    input_m <- matrix(rnorm(nr * nc), nrow = nr, ncol = nc)
    system.time(jaccardLuke(input_m)) # 41.55 sec
    system.time(jaccardHenrikB(input_m)) # 19.87 sec
    system.time(jaccardMinem(input_m)) # 11.17 sec
    

    The main difference is that I first calculate the unique index combinations for which the values actually need to be computed, and then reuse each result for its symmetric counterpart.
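
    To make that concrete, here is a toy illustration of the pair-reduction step (n = 4 samples gives 16 ordered pairs but only 10 unique unordered ones); the pmin()/pmax() shortcut is just an equivalent, shorter way to obtain the same unique pairs as the melt/dcast code above:

    library(data.table)
    n <- 4
    comb <- CJ(V1 = 1:n, V2 = 1:n)                          # 16 ordered pairs
    comb[, c("V1", "V2") := .(pmin(V1, V2), pmax(V1, V2))]  # sort within each pair
    nrow(unique(comb))                                      # 10 unique unordered pairs

    For the real data (265 samples) this means 35245 pairs instead of 70225, roughly halving the work before any similarity is computed.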