Search code examples
rdataframerowmultiple-columnscorrelation

Systematic Spearman correlations between columns in two data.frames by row.names in R


I have two data.frames in r:

  set.seed(12345)
  df1 <- data.frame(a=rnorm(20,0,0.4),
                      b=rnorm(20,0.3,0.8),
                      c=rnorm(20,-0.1,0.6),
                      d=rnorm(20,-0.23,0.3),
                      e=rnorm(20,0.2,0.4))
  library(purrr)
  df1 <- as.data.frame(map_df(df1, function(x) {x[sample(c(TRUE, NA), prob = c(0.8, 0.2), 
       size = length(x), replace = TRUE)]}))
  rownames(df1) <- sample(LETTERS, 20, replace=FALSE)
  df2 <- data.frame(one=rnorm(23,6,2),
                      two=rnorm(23,8,4),
                      three=rnorm(23,12,5),
                      four=rnorm(23,4,0.4),
                      five=rnorm(23,3,0.2))
  df2 <- as.data.frame(map_df(df2, function(x) {x[sample(c(TRUE, NA), prob = c(0.7, 0.3), 
       size = length(x), replace = TRUE)]}))
  rownames(df2) <- sample(LETTERS, 23, replace=FALSE)

How can I systematically determine Spearman correlations and p values between each of the columns in the two data.frames using matching row ids? So between column "a" in df1 and column "one" in df2, column "a" in df1 and column "two" in df2, ..., column "e" in df1 and column "five" in df2, to create a new data.frame with the results?

Expected result:

  df3 <- data.frame(letter=c(rep("a", 5), rep("b", 2),"..."),
                      number=c("one","two","three","four","five","one","two", "..."),
                      Spearman.r=c(-0.6352, 0.0182, 0.5944, 0.3846, -0.6606, 0.1154, 0.2364, "..."),
                      p.value=c(0.0171, 0.9730, 0.0457, 0.2183, 0.0438, 0.7097, 0.4854, "..."))

My attempt (unsuccessful):

I tried this, but this gave different results than expected. I don't know how to solve this!!!

 library(dplyr)
 # Create empty data.frame for results
 df3 <- data.frame(letter = character(),
                   number = character(),
                   Spearman.r = numeric(),
                   p.value = numeric(),
                   stringsAsFactors = FALSE)
 # Loop through each column in df1 and df2
 for (col1 in colnames(df1)) {
   for (col2 in colnames(df2)) {
 # Check for missing values in both 'x' and 'y'
     valid_rows <- !is.na(df1[[col1]]) & !is.na(df2[[col2]])
     x <- df1[[col1]][valid_rows]
     y <- df2[[col2]][valid_rows]
 # Calculate Spearman correlation and p-value
     if (length(x) > 1 & length(y) > 1) {
       result <- cor.test(x, y, method = "spearman")
 # Append the results to df3
       df3 <- df3 %>%
         add_row(letter = col1,
                 number = col2,
                 Spearman.r = result$estimate,
                 p.value = result$p.value)
     }
   }
 }

I think the problem above is that the row names are ignored/not matched. How can we fix that???


Solution

  • I would put element names of desired stats in a vector, Map cor.test over the respective columns and rbind the result — it's more concise.

    sts <- c('estimate', 'p.value')
    Map(cor.test, df1, df2[1:20, ]) |> lapply(`[`, sts) |>
      sapply(do.call, what=rbind) |> t() |> `colnames<-`(sts)
    #      estimate    p.value
    # a  0.31461249 0.21873250
    # b  0.03575649 0.90341200
    # c  0.33568792 0.41627099
    # d  0.53666496 0.07202434
    # e -0.34248509 0.27582473
    

    Note, that both columns should have same length, I subsetted df2 accordingly.


    Data:

    > dput(df1)
    structure(list(a = c(0.234211527137542, NA, -0.0437213258724216, 
    -0.181398869385105, 0.242354982336157, -0.727182387081492, 0.252039420427356, 
    -0.110473642090086, -0.113663897577348, -0.367728800989651, -0.0464991225408008, 
    0.726924817481687, 0.148251145703182, 0.208086583021983, NA, 
    0.326759935808233, -0.354543008497285, -0.132631035977021, 0.448285060667822, 
    0.119489479706917), b = c(0.923697539644259, 1.46462806598149, 
    -0.215462743385042, -0.942509924183749, NA, 1.74407801504866, 
    NA, 0.796303841038738, 0.789698794120679, 0.170151218465499, 
    0.949498542843088, 2.05746683707803, 1.93935226992495, 1.60595651158437, 
    0.503416954251244, 0.692950623418047, NA, NA, 1.71418708069837, 
    0.320640838918247), c = c(NA, NA, NA, 0.462284324109745, 0.412671032198332, 
    0.776437641862453, NA, 0.240441952054689, 0.249912592061411, 
    -0.884079300078655, -0.424231644317263, NA, -0.0678458377855273, 
    0.110997704333142, NA, NA, 0.31470276372493, 0.394277197223652, 
    1.18703901228817, -1.50816638701802), d = c(NA, NA, NA, NA, -0.40606387831613, 
    -0.77971319186282, 0.0364418297956053, 0.248046541706882, NA, 
    -0.618701504088447, -0.213615327401187, -0.465394811957463, -0.54480584568688, 
    NA, 0.190811614860229, 0.0527802552074576, 0.0178774861457288, 
    -0.473462147057324, -0.0871255157739038, NA), e = c(0.458153229682431, 
    0.617257420620576, 0.0782523549225354, 1.19084436678519, 0.588488269214978, 
    0.946839673826688, 0.468816987605915, 0.0768186476650648, 0.414609486684349, 
    0.52994802614067, -0.185560591605521, NA, 0.954778776810265, 
    0.0432722512019271, NA, 0.474932840222817, NA, 1.06308792660257, 
    -0.0399190255299342, -0.0778186773083004)), row.names = c("B", 
    "S", "H", "R", "E", "Z", "K", "N", "D", "W", "Q", "X", "F", "J", 
    "T", "U", "G", "A", "M", "Y"), class = "data.frame")
    > dput(df2)
    structure(list(one = c(5.51605196074149, 5.0365328649962, 4.01639427148655, 
    5.43870172239114, 7.26603468836034, 3.52036331378428, 9.52862813798864, 
    5.95264022369398, 6.39984096765487, 8.69438555528002, 6.0721469755994, 
    7.64916226113347, NA, 6.96190031053536, NA, 6.80272997501121, 
    6.43035434149409, 2.3685752941944, 4.17652115153425, 5.90191061829275, 
    5.18922504663637, 8.26076359569371, NA), two = c(8.30567007619257, 
    NA, 9.49648433374505, 7.3163837769897, NA, 10.1740884317625, 
    5.97925598859351, 11.1471831576081, 9.20379760643377, 13.2408956269895, 
    NA, 11.4034417504577, 6.22572812762124, 6.21290084956168, 8.0532201671046, 
    2.2554171750917, NA, 8.97408706250136, 12.2334489333839, 11.3253952716601, 
    NA, 1.03314762217058, NA), three = c(12.4855211233804, 11.6163315273091, 
    16.9597533885087, NA, 10.592101558378, NA, NA, NA, 15.3014668907241, 
    3.38898796427443, 1.32686975825689, 12.344727990685, NA, 0.54977910270623, 
    11.2490485589584, 10.6560910360448, 20.9566602057366, 15.3613402084905, 
    NA, 12.0609125306219, 19.670584321922, NA, 12.3921876637446), 
        four = c(3.68829557182812, NA, NA, NA, 3.81284465221339, 
        NA, 3.74330545689202, 4.25106872929897, 4.09933205072919, 
        3.71996967331032, 3.77303936595233, 3.89544242964422, 3.57444598568537, 
        NA, 4.30844149587023, 5.09896141689949, 3.96642609716967, 
        4.21742705370011, 4.30114448425416, NA, 4.40044793895629, 
        NA, 3.42629987702757), five = c(2.94693903674438, NA, 2.91699579425768, 
        2.90808486475521, NA, 2.76829217330492, 3.14217799968969, 
        3.25352035044765, 2.97136978884987, NA, 3.29657823686164, 
        2.96748221801475, NA, 3.09660797922777, NA, 2.86728525217144, 
        2.87307002115739, NA, 3.11537007577426, 2.57738392585485, 
        NA, 3.22942543805793, 3.00295872971124)), row.names = c("B", 
    "J", "U", "X", "A", "P", "I", "F", "R", "Z", "G", "K", "H", "S", 
    "C", "O", "N", "Y", "T", "L", "M", "W", "D"), class = "data.frame")