Search code examples
rmatrixequationpairwise

Running a equation on all pairs in dataframe, output matrix


I have a data frame with rows corresponding to measurements for each individual. I need to run an equation on all pairs of individuals, and output the results as a matrix with values corresponding to the output for each pair.

Data look like this:

df = data.frame(
sample=c("sample01","sample02","sample03","sample04","sample05"),
start=c(233,99,288,313,346),
min_01=c(2.94,3.26,3.15,2.55,2.59),
min_02=c(4.22,4.97,3.51,4.14,4.12),
min_03=c(5.7,6.61,4.86,5.44,5.47),
min_04=c(7.15,8.26,6.3,7.14,7.04),
min_05=c(10.52,11.9,9.7,10.49,10.25),
min_06=c(13.81,15.51,13.02,14.55,14.62),
min_07=c(16.15,18.98,16.63,18.19,17.49),
min_08=c(15.34,18.43,15.83,17.86,17.08),
min_09=c(14.27,15.59,13.27,14.87,14.6),
min_10=c(9.83,10.9,9,10.14,9.83),
min_11=c(5.53,5.95,4.31,5.26,5.18),
min_12=c(3.12,2.98,2.96,2.35,2.3),
max_01=c(13.13,14.1,14.92,14.46,13.34),
max_02=c(15.83,16.92,16.86,16.35,15.74),
max_03=c(18.49,19.75,19.23,18.99,18.47),
max_04=c(22.86,23.46,22.99,20.93,22.89),
max_05=c(27.53,28.75,27.74,26.12,28.42),
max_06=c(31.88,33.4,32.29,31.09,33.46),
max_07=c(35.23,36.78,36.02,35.51,37.3),
max_08=c(34.68,36.15,35.56,35.4,36.61),
max_09=c(32.44,32.97,32.3,32.31,33.11),
max_10=c(26.66,26.94,26.27,26.22,26.87),
max_11=c(17.96,19.2,19.08,19.06,18.51),
max_12=c(13.06,14.12,14.74,14.17,13.26))

The equation to be run is:

sample01-02-01 = (sample01$max_01-sample02$min_01)/SQRT((sample01$max_01-sample01$min_01)*(sample02$max_01-sample02$min_01))

sample01-02-02 = (sample01$max_02-sample02$min_02)/SQRT((sample01$max_02-sample01$min_02)*(sample02$max_02-sample02$min_02))

sample01-02-03 = (sample01$max_03... etc

... for a total of 12 per pair (up though sample01-02-12), summed to generate a single value for the pairwise output matrix.

Any help would be appreciated!


Solution

  • Here's a shot, using some reshaping from tidyr and grouping operations:

    library(dplyr)
    library(tidyr)
    longdf <- df %>%
      pivot_longer(
        cols = min_01:max_12,
        names_to = c(".value", "set"),
        names_pattern = "(.*)_(.*)"
      )
    longdf
    # # A tibble: 60 x 5
    #    sample   start set     min   max
    #    <fct>    <dbl> <chr> <dbl> <dbl>
    #  1 sample01   233 01     2.94  13.1
    #  2 sample01   233 02     4.22  15.8
    #  3 sample01   233 03     5.7   18.5
    #  4 sample01   233 04     7.15  22.9
    #  5 sample01   233 05    10.5   27.5
    #  6 sample01   233 06    13.8   31.9
    #  7 sample01   233 07    16.2   35.2
    #  8 sample01   233 08    15.3   34.7
    #  9 sample01   233 09    14.3   32.4
    # 10 sample01   233 10     9.83  26.7
    # # ... with 50 more rows
    

    From here, let's join it back on itself in a cartesion fashion, removing self-self comparisons and ensuring the sets are the same (min_01 with min_01 of the other sample):

    crossing(
      rename_all(longdf, ~ paste0(., "1")),
      rename_all(longdf, ~ paste0(., "2"))
    ) %>% 
      filter(sample1 != sample2, set1 == set2)
    # # A tibble: 240 x 10
    #    sample1  start1 set1   min1  max1 sample2  start2 set2   min2  max2
    #    <fct>     <dbl> <chr> <dbl> <dbl> <fct>     <dbl> <chr> <dbl> <dbl>
    #  1 sample01    233 01     2.94  13.1 sample02     99 01     3.26  14.1
    #  2 sample01    233 01     2.94  13.1 sample03    288 01     3.15  14.9
    #  3 sample01    233 01     2.94  13.1 sample04    313 01     2.55  14.5
    #  4 sample01    233 01     2.94  13.1 sample05    346 01     2.59  13.3
    #  5 sample01    233 02     4.22  15.8 sample02     99 02     4.97  16.9
    #  6 sample01    233 02     4.22  15.8 sample03    288 02     3.51  16.9
    #  7 sample01    233 02     4.22  15.8 sample04    313 02     4.14  16.4
    #  8 sample01    233 02     4.22  15.8 sample05    346 02     4.12  15.7
    #  9 sample01    233 03     5.7   18.5 sample02     99 03     6.61  19.8
    # 10 sample01    233 03     5.7   18.5 sample03    288 03     4.86  19.2
    # # ... with 230 more rows
    

    From here, it's "just" some math:

    crossing(
      rename_all(longdf, ~ paste0(., "1")),
      rename_all(longdf, ~ paste0(., "2"))
    ) %>% 
      filter(sample1 != sample2, set1 == set2) %>%
      mutate(
        out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
      )
    # # A tibble: 240 x 11
    #    sample1  start1 set1   min1  max1 sample2  start2 set2   min2  max2   out
    #    <fct>     <dbl> <chr> <dbl> <dbl> <fct>     <dbl> <chr> <dbl> <dbl> <dbl>
    #  1 sample01    233 01     2.94  13.1 sample02     99 01     3.26  14.1 0.939
    #  2 sample01    233 01     2.94  13.1 sample03    288 01     3.15  14.9 0.911
    #  3 sample01    233 01     2.94  13.1 sample04    313 01     2.55  14.5 0.960
    #  4 sample01    233 01     2.94  13.1 sample05    346 01     2.59  13.3 1.01 
    #  5 sample01    233 02     4.22  15.8 sample02     99 02     4.97  16.9 0.922
    #  6 sample01    233 02     4.22  15.8 sample03    288 02     3.51  16.9 0.990
    #  7 sample01    233 02     4.22  15.8 sample04    313 02     4.14  16.4 0.982
    #  8 sample01    233 02     4.22  15.8 sample05    346 02     4.12  15.7 1.01 
    #  9 sample01    233 03     5.7   18.5 sample02     99 03     6.61  19.8 0.916
    # 10 sample01    233 03     5.7   18.5 sample03    288 03     4.86  19.2 1.01 
    # # ... with 230 more rows
    

    ... and summing by group:

    crossing(
      rename_all(longdf, ~ paste0(., "1")),
      rename_all(longdf, ~ paste0(., "2"))
    ) %>% 
      filter(sample1 != sample2, set1 == set2) %>%
      mutate(
        out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
      ) %>%
      group_by(sample1, sample2) %>%
      summarize(out = sum(out)) %>%
      ungroup()
    # # A tibble: 20 x 3
    #    sample1  sample2    out
    #    <fct>    <fct>    <dbl>
    #  1 sample01 sample02  11.1
    #  2 sample01 sample03  11.9
    #  3 sample01 sample04  11.8
    #  4 sample01 sample05  11.8
    #  5 sample02 sample01  12.9
    #  6 sample02 sample03  12.8
    #  7 sample02 sample04  12.7
    #  8 sample02 sample05  12.6
    #  9 sample03 sample01  12.1
    # 10 sample03 sample02  11.3
    # 11 sample03 sample04  12.0
    # 12 sample03 sample05  11.9
    # 13 sample04 sample01  12.2
    # 14 sample04 sample02  11.3
    # 15 sample04 sample03  12.1
    # 16 sample04 sample05  11.9
    # 17 sample05 sample01  12.3
    # 18 sample05 sample02  11.4
    # 19 sample05 sample03  12.1
    # 20 sample05 sample04  12.1
    

    And if you need them in a matrix-like layout, then

    crossing(
      rename_all(longdf, ~ paste0(., "1")),
      rename_all(longdf, ~ paste0(., "2"))
    ) %>% 
      filter(sample1 != sample2, set1 == set2) %>%
      mutate(
        out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
      ) %>%
      group_by(sample1, sample2) %>%
      summarize(out = sum(out)) %>%
      ungroup() %>%
      pivot_wider(sample1, names_from = "sample2", values_from = "out") %>%
      select(c("sample1", setdiff(sort(colnames(.)), "sample1")))
    # # A tibble: 5 x 6
    #   sample1  sample01 sample02 sample03 sample04 sample05
    #   <fct>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
    # 1 sample01     NA       11.1     11.9     11.8     11.8
    # 2 sample02     12.9     NA       12.8     12.7     12.6
    # 3 sample03     12.1     11.3     NA       12.0     11.9
    # 4 sample04     12.2     11.3     12.1     NA       11.9
    # 5 sample05     12.3     11.4     12.1     12.1     NA