I would like to get the estimate, statistic and p value from correlating two data frames together. I know how to do this using a nested for loop or foreach using parallelisation. However, I do not know how to do this using lapply.
One data frame has 10 rows and 15 columns and the other has 10 rows and 20 columns.
## Toy data: two data frames with 10 rows each (15 and 20 columns), filled with
## uniform random draws from [-1, 1].
df1 <- as.data.frame(matrix(runif(10*15, -1, 1), ncol=15))
df2 <- as.data.frame(matrix(runif(10*20, -1, 1), ncol=20))
## Name the columns a1..a15 and b1..b20 (seq_len() instead of the 1:n idiom).
colnames(df1) <- paste0("a", seq_len(ncol(df1)))
colnames(df2) <- paste0("b", seq_len(ncol(df2)))
## One row per (df1 column, df2 column) pair.
combo <- expand.grid(colnames(df1), colnames(df2)) ## 300 combinations
Ideally, the row names would show the combinations between the two data frames (combo), and then there would be a column for estimate, statistic and p value e.g.
x <- list(df1, df2)
## Spearman test between columns 1 and 2 *within* each data frame in the list
## (one test per data frame, not one per df1/df2 column pair).
res1 <- lapply(x, function(d) cor.test(d[, 1], d[, 2], method="spearman"))
## Keep only the three components of interest from each test result.
res2 <- lapply(res1, function(r) r[c("estimate", "statistic", "p.value")])
## Flatten each result to a named numeric vector and stack them row-wise.
stats <- do.call(rbind, lapply(res2, function(r) unlist(r)))
or
## Names of the cor.test() result components to keep.
stts <- c('estimate', 'statistic', 'p.value')
## NOTE(review): `[stts]` sits inside the cor.test() call, so it indexes the string "spearman" rather than the test result.
## NOTE(review): %>% is not base R; presumably magrittr (or a package re-exporting it) is attached -- confirm.
out <- lapply(x, function(x) cor.test(x[,2], x[,3], method="spearman" [stts])) %>% do.call(what=rbind)
estimate statistic p.value
a1:b1
a1:b2
a1:b3
a1:b4
Since you have two varying values you may want to use `Map` rather than `lapply`, i.e. provide the columns of `combo` separately instead of tediously subsetting a data frame. Then, `lapply` over the output list to extract the desired elements and put `unlist` into an `sapply` (`sapply` already tries to return a matrix, so there is no need to `rbind`) and transpose (`t`) the result. `paste` the combos together to get nice row names.
> combo <- combo[with(combo, order(Var1, Var2)), ] ## optional: sort rows by Var1, then Var2
> ## Spearman test for every (df1 column, df2 column) pair listed in combo.
> ## expand.grid() returns factors by default, and indexing with a factor uses its
> ## integer codes (see ?Extract); as.character() makes the column lookup by name.
> res <- with(combo, Map(\(x, y) cor.test(df1[, x], df2[, y], method='spearman'),
+                        as.character(Var1), as.character(Var2))) |>
+ ## keep the three components of interest, flatten each to a numeric vector,
+ ## bind them as columns (sapply), transpose, and label rows "a<i>:b<j>"
+ lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+ sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'), combo))
> head(res)
estimate.rho statistic.S p.value
a1:b1 -0.06666667 176 0.86475353
a1:b2 0.26060606 122 0.46967525
a1:b3 0.17575758 136 0.63196736
a1:b4 0.27272727 120 0.44827216
a1:b5 0.67272727 54 0.03938141
a1:b6 0.29696970 116 0.40695023
Since you mentioned parallelization, just switch to `parallel::mcmapply`. (I hope you are running a Unix-like OS; otherwise it is not that easy.)
> ## Same computation, forked across cores. as.character() makes the column lookup
> ## by name (combo's columns are factors when built with expand.grid() defaults).
> res <- parallel::mcmapply(\(x, y) cor.test(df1[, x], df2[, y], method='spearman'),
+ as.character(combo$Var1), as.character(combo$Var2),
+ ## leave one core free, but never ask for fewer than one worker
+ ## (detectCores() - 1 is 0 on a single-core machine and detectCores() may be NA)
+ mc.cores=max(1L, parallel::detectCores() - 1L, na.rm=TRUE),
+ SIMPLIFY=FALSE) |>
+ lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+ sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'),
+ combo))
> head(res)
estimate.rho statistic.S p.value
a1:b1 -0.06666667 176 0.86475353
a1:b2 0.26060606 122 0.46967525
a1:b3 0.17575758 136 0.63196736
a1:b4 0.27272727 120 0.44827216
a1:b5 0.67272727 54 0.03938141
a1:b6 0.29696970 116 0.40695023
On Windows we can do
> ## Start a socket cluster; leave one core free but always keep at least one worker
> ## (detectCores() - 1 is 0 on a single-core machine and detectCores() may be NA).
> .CL <- parallel::makePSOCKcluster(max(1L, parallel::detectCores() - 1L, na.rm=TRUE))
> ## The anonymous function refers to df1 and df2, so copy them to the workers first.
> parallel::clusterExport(.CL, c('df1', 'df2'))
> ## as.character() makes the column lookup by name (combo's columns are factors when
> ## built with expand.grid() defaults).
> res <- parallel::clusterMap(.CL, \(x, y) cor.test(df1[, x], df2[, y], method='spearman'),
+                             as.character(combo$Var1), as.character(combo$Var2)) |>
+ lapply(`[`, c('estimate', 'statistic', 'p.value')) |>
+ sapply(unlist) |> t() |> `rownames<-`(Reduce(\(...) paste(..., sep=':'), combo))
> head(res)
estimate.rho statistic.S p.value
a1:b1 -0.06666667 176 0.8647535
a2:b1 0.33333333 110 0.3488462
a3:b1 0.29696970 116 0.4069502
a4:b1 0.07878788 152 0.8380041
a5:b1 -0.49090909 246 0.1544427
a6:b1 0.33333333 110 0.3488462
> parallel::stopCluster(.CL) ## shut down the cluster created with makePSOCKcluster() once the results are in
Data:
set.seed(42) ## fixed seed so the random draws below can be reproduced
## 10 x 15 data frame of uniform draws from [-1, 1], columns a1..a15
df1 <- data.frame(matrix(runif(10*15, -1, 1), nrow=10))
names(df1) <- paste0("a", seq_len(15))
## 10 x 20 data frame of uniform draws from [-1, 1], columns b1..b20
df2 <- data.frame(matrix(runif(10*20, -1, 1), nrow=10))
names(df2) <- paste0("b", seq_len(20))
## every pairing of a df1 column name with a df2 column name
combo <- expand.grid(names(df1), names(df2)) ## 300 combinations