Search code examples
rnested-loopslapplycorrelation

How to use lapply to replace nested for loop to get correlations between two data frames?


I would like to get the estimate, statistic and p-value from correlating the columns of two data frames with each other. I know how to do this using a nested for loop, or using foreach with parallelisation. However, I do not know how to do this using lapply.

One data frame has 10 rows and 15 columns and the other has 10 rows and 20 columns.

## Two random data frames with 10 rows each: 15 columns and 20 columns.
df1 <- as.data.frame(matrix(runif(10 * 15, -1, 1), ncol = 15))
df2 <- as.data.frame(matrix(runif(10 * 20, -1, 1), ncol = 20))

## seq_len() instead of 1:ncol() so that a zero-column frame would get no
## names rather than the bogus sequence c(1, 0).
colnames(df1) <- paste0("a", seq_len(ncol(df1)))
colnames(df2) <- paste0("b", seq_len(ncol(df2)))

## Every (df1 column, df2 column) name pair: 15 * 20 = 300 combinations.
combo <- expand.grid(colnames(df1), colnames(df2))

Ideally, the row names would show the combinations between the two data frames (combo), and then there would be a column for estimate, statistic and p value e.g.

## Shape of the desired result only: this correlates column 1 with column 2
## inside each data frame of the list, not columns of df1 against df2.
x <- list(df1, df2)
res1 <- lapply(x, function(x) {
  cor.test(x[, 1], x[, 2], method = "spearman")
})
## Keep just the three components of interest from every test result.
res2 <- lapply(res1, function(r) r[c("estimate", "statistic", "p.value")])
## Flatten each kept result to a named vector and stack them as rows.
stats <- do.call(rbind, lapply(res2, unlist))

or

## Names of the cor.test() result components to keep.
stts <- c("estimate", "statistic", "p.value")
## The subscript belongs on the object returned by cor.test(), not on the
## method string: "spearman"[stts] evaluates to NA and makes cor.test() fail.
## The base pipe replaces %>%, which needs magrittr to be attached.
out <- lapply(
  x,
  function(x) cor.test(x[, 2], x[, 3], method = "spearman")[stts]
) |>
  do.call(what = rbind)


        estimate    statistic    p.value   

a1:b1
a1:b2
a1:b3
a1:b4

Solution

  • Since you have two varying values you may want to use Map rather than lapply, i.e. provide the columns of combo separately instead of tediously subsetting a data frame. Then, lapply over the output list to extract the desired elements, put unlist into an sapply (sapply already tries to return a matrix, so there is no need to rbind) and transpose the result. Finally, paste the combos together to get nice row names.

    > ## Optional: sort the pairs so the rows come out as a1:b1, a1:b2, ...
    > combo <- combo[with(combo, order(Var1, Var2)), ]
    > ## One Spearman test per (df1 column, df2 column) pair; the method is
    > ## spelled out instead of relying on partial matching (meth = 's').
    > res <- with(combo, Map(\(x, y) cor.test(df1[, x], df2[, y],
    +                                          method = "spearman"),
    +                        Var1, Var2)) |>
    +   lapply(`[`, c("estimate", "statistic", "p.value")) |>
    +   sapply(unlist) |>
    +   t() |>
    +   `rownames<-`(Reduce(\(...) paste(..., sep = ":"), combo))
    > head(res)
          estimate.rho statistic.S    p.value
    a1:b1  -0.06666667         176 0.86475353
    a1:b2   0.26060606         122 0.46967525
    a1:b3   0.17575758         136 0.63196736
    a1:b4   0.27272727         120 0.44827216
    a1:b5   0.67272727          54 0.03938141
    a1:b6   0.29696970         116 0.40695023
    

    Since you mentioned parallelization, just switch to parallel::mcmapply. (This relies on forking, so it only runs in parallel on a Unix-like OS; on Windows it is not that easy.)

    > ## Same pipeline, with the tests spread over several cores. Keep one core
    > ## free, but never request fewer than one worker: detectCores() can
    > ## return 1 or NA, and mc.cores must be at least 1.
    > res <- parallel::mcmapply(\(x, y) cor.test(df1[, x], df2[, y],
    +                                            method = "spearman"),
    +                           combo$Var1, combo$Var2,
    +                           mc.cores = max(1L, parallel::detectCores() - 1L,
    +                                          na.rm = TRUE),
    +                           SIMPLIFY = FALSE) |>
    +   lapply(`[`, c("estimate", "statistic", "p.value")) |>
    +   sapply(unlist) |>
    +   t() |>
    +   `rownames<-`(Reduce(\(...) paste(..., sep = ":"), combo))
    > head(res)
          estimate.rho statistic.S    p.value
    a1:b1  -0.06666667         176 0.86475353
    a1:b2   0.26060606         122 0.46967525
    a1:b3   0.17575758         136 0.63196736
    a1:b4   0.27272727         120 0.44827216
    a1:b5   0.67272727          54 0.03938141
    a1:b6   0.29696970         116 0.40695023
    

    On Windows we can do

    > ## Start a socket cluster with at least one worker (detectCores() can
    > ## return 1 or NA), and copy df1 and df2 to the workers so the mapped
    > ## function can find them.
    > .CL <- parallel::makePSOCKcluster(
    +   max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    + )
    > parallel::clusterExport(.CL, c("df1", "df2"))
    > res <- parallel::clusterMap(.CL,
    +                             \(x, y) cor.test(df1[, x], df2[, y],
    +                                              method = "spearman"),
    +                             combo$Var1, combo$Var2) |>
    +   lapply(`[`, c("estimate", "statistic", "p.value")) |>
    +   sapply(unlist) |>
    +   t() |>
    +   `rownames<-`(Reduce(\(...) paste(..., sep = ":"), combo))
    > head(res)
          estimate.rho statistic.S   p.value
    a1:b1  -0.06666667         176 0.8647535
    a2:b1   0.33333333         110 0.3488462
    a3:b1   0.29696970         116 0.4069502
    a4:b1   0.07878788         152 0.8380041
    a5:b1  -0.49090909         246 0.1544427
    a6:b1   0.33333333         110 0.3488462
    > parallel::stopCluster(.CL)
    

    Data:

    ## Fixed seed so the random example data can be reproduced.
    set.seed(42)
    ## 10 x 15 frame of uniform draws on [-1, 1], columns named a1 ... a15.
    df1 <- data.frame(matrix(runif(n = 10 * 15, min = -1, max = 1), ncol = 15))
    names(df1) <- sprintf("a%d", seq_len(15))
    ## 10 x 20 frame drawn right after df1, columns named b1 ... b20.
    df2 <- data.frame(matrix(runif(n = 10 * 20, min = -1, max = 1), ncol = 20))
    names(df2) <- sprintf("b%d", seq_len(20))
    ## All 15 * 20 = 300 pairs of column names.
    combo <- expand.grid(names(df1), names(df2))