How to use lapply to replace nested for loop to get correlations between two data frames?

I would like to get the estimate, statistic and p-value from correlating the columns of two data frames with each other. I know how to do this using a nested for loop, or with foreach using parallelisation. However, I do not know how to do this using lapply.

One data frame has 10 rows and 15 columns and the other has 10 rows and 20 columns.

# Two example data frames with 10 rows each: 15 columns in df1 and 20 in df2,
# all filled with uniform random draws from [-1, 1].
df1 <- as.data.frame(matrix(runif(150, min = -1, max = 1), nrow = 10))
df2 <- as.data.frame(matrix(runif(200, min = -1, max = 1), nrow = 10))

# Label the columns a1..a15 and b1..b20.
names(df1) <- paste0("a", seq_along(df1))
names(df2) <- paste0("b", seq_along(df2))

# Every pairing of a df1 column with a df2 column.
combo <- expand.grid(names(df1), names(df2))  ## 300 combinations

Ideally, the row names would show the combinations between the two data frames (combo), and then there would be a column for estimate, statistic and p value e.g.

# Illustration of the desired output layout only: each list element is tested
# against itself (column 1 vs. column 2 of the same data frame), so this does
# not yet correlate df1 with df2.
x <- list(df1, df2)
res1 <- lapply(x, function(x) {
  cor.test(x[, 1], x[, 2], method = "spearman")
})
# Keep just the three fields of interest from each test result.
res2 <- lapply(res1, function(ht) ht[c("estimate", "statistic", "p.value")])
# Flatten each result to a named numeric vector and stack them as rows.
stats <- do.call(rbind, lapply(res2, unlist))

# Names of the cor.test() result fields to keep.
stts <- c("estimate", "statistic", "p.value")
# Subset the object returned by cor.test(): the `[stts]` has to sit outside the
# call, otherwise it indexes the string "spearman" and `method` becomes NA.
# The native pipe is used so that magrittr is not needed for `%>%`.
# `out` is a list-matrix with one row per element of `x` and one column per
# entry of `stts`.
out <- lapply(
  x,
  function(d) cor.test(d[, 2], d[, 3], method = "spearman")[stts]
) |>
  do.call(what = rbind)


        estimate    statistic    p.value   

a1:b1
a1:b2
a1:b3
a1:b4

Solution

Since you have two varying values, you may want to use Map rather than lapply, i.e. pass the two columns of combo as separate arguments instead of tediously subsetting a data frame. Then lapply over the resulting list to extract the desired elements, run unlist inside sapply (sapply already tries to simplify to a matrix, so there is no need to rbind), and transpose the result. Finally, paste the combos together to get nice row names.

> combo <- combo[with(combo, order(Var1, Var2)), ]  ## optional
> ## One Spearman test per row of combo; keep three fields of each result,
> ## then bind them into a 300 x 3 matrix with "a:b" row names.
> res <- with(combo, Map(\(x, y) cor.test(df1[, x], df2[, y], method='spearman'), Var1, Var2)) |>
+   lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+   sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'), combo))
> head(res)
      estimate.rho statistic.S    p.value
a1:b1  -0.06666667         176 0.86475353
a1:b2   0.26060606         122 0.46967525
a1:b3   0.17575758         136 0.63196736
a1:b4   0.27272727         120 0.44827216
a1:b5   0.67272727          54 0.03938141
a1:b6   0.29696970         116 0.40695023

Since you mentioned parallelization, just switch to parallel::mcmapply. (This assumes you are running a Unix-like OS: mcmapply relies on forking, which Windows does not support, so it is not that easy there.)

> ## Same computation, forked across cores. Leave one core free, but never ask
> ## for fewer than one worker (detectCores() can return 1 or NA).
> res <- parallel::mcmapply(\(x, y) cor.test(df1[, x], df2[, y], method='spearman'),
+                           combo$Var1, combo$Var2,
+                           mc.cores=max(1L, parallel::detectCores() - 1L, na.rm=TRUE),
+                           SIMPLIFY=FALSE) |>
+   lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+   sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'),
+                                                combo))
> head(res)
      estimate.rho statistic.S    p.value
a1:b1  -0.06666667         176 0.86475353
a1:b2   0.26060606         122 0.46967525
a1:b3   0.17575758         136 0.63196736
a1:b4   0.27272727         120 0.44827216
a1:b5   0.67272727          54 0.03938141
a1:b6   0.29696970         116 0.40695023

On Windows, we can use a socket cluster with parallel::clusterMap instead:

> ## Start a socket cluster, leaving one core free but always using at least
> ## one worker (detectCores() can return 1 or NA).
> .CL <- parallel::makePSOCKcluster(max(1L, parallel::detectCores() - 1L, na.rm=TRUE))
> ## The workers do not share this session's workspace, so send them the data.
> parallel::clusterExport(.CL, c('df1', 'df2'))
> res <- parallel::clusterMap(.CL, \(x, y) cor.test(df1[, x], df2[, y], method='spearman'), combo$Var1, combo$Var2) |>
+   lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+   sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'), combo))
> head(res)
      estimate.rho statistic.S   p.value
a1:b1  -0.06666667         176 0.8647535
a2:b1   0.33333333         110 0.3488462
a3:b1   0.29696970         116 0.4069502
a4:b1   0.07878788         152 0.8380041
a5:b1  -0.49090909         246 0.1544427
a6:b1   0.33333333         110 0.3488462
> parallel::stopCluster(.CL)

Data:

set.seed(42)  ## fix the RNG state so the results can be reproduced
# 10 x 15 and 10 x 20 data frames of uniform draws from [-1, 1]; df1 is
# generated first so that the random stream is consumed in the same order.
df1 <- data.frame(matrix(runif(150, min = -1, max = 1), nrow = 10))
names(df1) <- paste0("a", seq_along(df1))
df2 <- data.frame(matrix(runif(200, min = -1, max = 1), nrow = 10))
names(df2) <- paste0("b", seq_along(df2))
# All pairings of a df1 column name with a df2 column name.
combo <- expand.grid(names(df1), names(df2))  ## 300 combinations