I have a data frame with rows corresponding to measurements for each individual. I need to run an equation on all pairs of individuals, and output the results as a matrix with values corresponding to the output for each pair.
Data look like this:
df = data.frame(
sample=c("sample01","sample02","sample03","sample04","sample05"),
start=c(233,99,288,313,346),
min_01=c(2.94,3.26,3.15,2.55,2.59),
min_02=c(4.22,4.97,3.51,4.14,4.12),
min_03=c(5.7,6.61,4.86,5.44,5.47),
min_04=c(7.15,8.26,6.3,7.14,7.04),
min_05=c(10.52,11.9,9.7,10.49,10.25),
min_06=c(13.81,15.51,13.02,14.55,14.62),
min_07=c(16.15,18.98,16.63,18.19,17.49),
min_08=c(15.34,18.43,15.83,17.86,17.08),
min_09=c(14.27,15.59,13.27,14.87,14.6),
min_10=c(9.83,10.9,9,10.14,9.83),
min_11=c(5.53,5.95,4.31,5.26,5.18),
min_12=c(3.12,2.98,2.96,2.35,2.3),
max_01=c(13.13,14.1,14.92,14.46,13.34),
max_02=c(15.83,16.92,16.86,16.35,15.74),
max_03=c(18.49,19.75,19.23,18.99,18.47),
max_04=c(22.86,23.46,22.99,20.93,22.89),
max_05=c(27.53,28.75,27.74,26.12,28.42),
max_06=c(31.88,33.4,32.29,31.09,33.46),
max_07=c(35.23,36.78,36.02,35.51,37.3),
max_08=c(34.68,36.15,35.56,35.4,36.61),
max_09=c(32.44,32.97,32.3,32.31,33.11),
max_10=c(26.66,26.94,26.27,26.22,26.87),
max_11=c(17.96,19.2,19.08,19.06,18.51),
max_12=c(13.06,14.12,14.74,14.17,13.26))
The equation to be run is:
sample01-02-01 = (sample01$max_01-sample02$min_01)/SQRT((sample01$max_01-sample01$min_01)*(sample02$max_01-sample02$min_01))
sample01-02-02 = (sample01$max_02-sample02$min_02)/SQRT((sample01$max_02-sample01$min_02)*(sample02$max_02-sample02$min_02))
sample01-02-03 = (sample01$max_03... etc
... for a total of 12 per pair (up though sample01-02-12), summed to generate a single value for the pairwise output matrix.
Any help would be appreciated!
Here's a shot, using some reshaping from tidyr
and grouping operations:
library(dplyr)
library(tidyr)
longdf <- df %>%
pivot_longer(
cols = min_01:max_12,
names_to = c(".value", "set"),
names_pattern = "(.*)_(.*)"
)
longdf
# # A tibble: 60 x 5
# sample start set min max
# <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1
# 2 sample01 233 02 4.22 15.8
# 3 sample01 233 03 5.7 18.5
# 4 sample01 233 04 7.15 22.9
# 5 sample01 233 05 10.5 27.5
# 6 sample01 233 06 13.8 31.9
# 7 sample01 233 07 16.2 35.2
# 8 sample01 233 08 15.3 34.7
# 9 sample01 233 09 14.3 32.4
# 10 sample01 233 10 9.83 26.7
# # ... with 50 more rows
From here, let's join it back on itself in a cartesion fashion, removing self-self comparisons and ensuring the sets are the same (min_01
with min_01
of the other sample):
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2)
# # A tibble: 240 x 10
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2
# # ... with 230 more rows
From here, it's "just" some math:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
)
# # A tibble: 240 x 11
# sample1 start1 set1 min1 max1 sample2 start2 set2 min2 max2 out
# <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 sample01 233 01 2.94 13.1 sample02 99 01 3.26 14.1 0.939
# 2 sample01 233 01 2.94 13.1 sample03 288 01 3.15 14.9 0.911
# 3 sample01 233 01 2.94 13.1 sample04 313 01 2.55 14.5 0.960
# 4 sample01 233 01 2.94 13.1 sample05 346 01 2.59 13.3 1.01
# 5 sample01 233 02 4.22 15.8 sample02 99 02 4.97 16.9 0.922
# 6 sample01 233 02 4.22 15.8 sample03 288 02 3.51 16.9 0.990
# 7 sample01 233 02 4.22 15.8 sample04 313 02 4.14 16.4 0.982
# 8 sample01 233 02 4.22 15.8 sample05 346 02 4.12 15.7 1.01
# 9 sample01 233 03 5.7 18.5 sample02 99 03 6.61 19.8 0.916
# 10 sample01 233 03 5.7 18.5 sample03 288 03 4.86 19.2 1.01
# # ... with 230 more rows
... and summing by group:
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup()
# # A tibble: 20 x 3
# sample1 sample2 out
# <fct> <fct> <dbl>
# 1 sample01 sample02 11.1
# 2 sample01 sample03 11.9
# 3 sample01 sample04 11.8
# 4 sample01 sample05 11.8
# 5 sample02 sample01 12.9
# 6 sample02 sample03 12.8
# 7 sample02 sample04 12.7
# 8 sample02 sample05 12.6
# 9 sample03 sample01 12.1
# 10 sample03 sample02 11.3
# 11 sample03 sample04 12.0
# 12 sample03 sample05 11.9
# 13 sample04 sample01 12.2
# 14 sample04 sample02 11.3
# 15 sample04 sample03 12.1
# 16 sample04 sample05 11.9
# 17 sample05 sample01 12.3
# 18 sample05 sample02 11.4
# 19 sample05 sample03 12.1
# 20 sample05 sample04 12.1
And if you need them in a matrix-like layout, then
crossing(
rename_all(longdf, ~ paste0(., "1")),
rename_all(longdf, ~ paste0(., "2"))
) %>%
filter(sample1 != sample2, set1 == set2) %>%
mutate(
out = (max1 - min2) / sqrt((max1 - min1) * (max2 - min2))
) %>%
group_by(sample1, sample2) %>%
summarize(out = sum(out)) %>%
ungroup() %>%
pivot_wider(sample1, names_from = "sample2", values_from = "out") %>%
select(c("sample1", setdiff(sort(colnames(.)), "sample1")))
# # A tibble: 5 x 6
# sample1 sample01 sample02 sample03 sample04 sample05
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 sample01 NA 11.1 11.9 11.8 11.8
# 2 sample02 12.9 NA 12.8 12.7 12.6
# 3 sample03 12.1 11.3 NA 12.0 11.9
# 4 sample04 12.2 11.3 12.1 NA 11.9
# 5 sample05 12.3 11.4 12.1 12.1 NA