I have the following DF:
structure(list(AgeGroup = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Young",
"Old"), class = "factor"), variable = structure(c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L,
30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L), .Label = c("Point.1", "Point.2", "Point.3", "Point.4",
"Point.5", "Point.6", "Point.7", "Point.8", "Point.9", "Point.10",
"Point.11", "Point.12", "Point.13", "Point.14", "Point.15", "Point.16",
"Point.17", "Point.18", "Point.19", "Point.20", "Point.21", "Point.22",
"Point.23", "Point.24", "Point.25", "Point.26", "Point.27", "Point.28",
"Point.29", "Point.30", "Point.31", "Point.32", "Point.33", "Point.34",
"Point.35", "Point.36", "Point.37", "Point.38", "Point.39", "Point.40",
"Point.41", "Point.42", "Point.43", "Point.44", "Point.45", "Point.46",
"Point.47", "Point.48", "Point.49", "Point.50"), class = "factor"),
value = c(0.714518666666667, 0.723876630952381, 0.728961368421053,
0.735228897233202, 0.701283807017544, 0.71396457312253, 0.663229964912281,
0.68923661660079, 0.613014666666667, 0.652671079051383, 0.547104,
0.602951166007905, 0.504106245614035, 0.558832648221344,
0.487034052631579, 0.515752438735178, 0.451825245614035,
0.476300007905138, 0.442370175438596, 0.441173656126482,
0.438668315789474, 0.435859173913043, 0.450059526315789,
0.434047494071146, 0.478947649122807, 0.450561841897233,
0.481134438596491, 0.461228027667984, 0.446763543859649,
0.451031316205534, 0.396206754385965, 0.406836889328063,
0.357049368421053, 0.368716249011858, 0.343943631578947,
0.368048932806324, 0.376060403508772, 0.398834193675889,
0.413613877192982, 0.434683889328063, 0.434964894736842,
0.448746023715415, 0.451208631578947, 0.450663276679842,
0.470569192982456, 0.473143399209486, 0.515300736842105,
0.502499193675889, 0.543379719298246, 0.507495533596838,
0.550050701754386, 0.498506288537549, 0.541725807017544,
0.482379664031621, 0.517293315789474, 0.458068636363636,
0.485205245614035, 0.423109671936759, 0.438844403508772,
0.385925747035573, 0.39522349122807, 0.362403612648221, 0.374209192982456,
0.350889750988142, 0.354036315789474, 0.336213118577075,
0.340668122807018, 0.327800648221344, 0.326388666666667,
0.322577146245059, 0.328114842105263, 0.319440624505929,
0.342721666666667, 0.323974818181818, 0.357620473684211,
0.335501339920949, 0.372856842105263, 0.343831292490119,
0.377362315789474, 0.361571442687747, 0.393890736842105,
0.377489727272727, 0.419330684210526, 0.38274228458498, 0.419797666666667,
0.387899881422925, 0.423127684210526, 0.385955055335968,
0.42140750877193, 0.377730351778656, 0.403711631578947, 0.366319122529644,
0.390753140350877, 0.355189754940711, 0.373226596491228,
0.347452173913044, 0.348689877192982, 0.340376324110672,
0.329466947368421, 0.344867375494071)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
A subset of it looks like this:
A tibble: 100 x 3
AgeGroup variable value
<fct> <fct> <dbl>
1 Young Point.1 0.715
2 Old Point.1 0.724
3 Young Point.2 0.729
4 Old Point.2 0.735
5 Young Point.3 0.701
6 Old Point.3 0.714
7 Young Point.4 0.663
8 Old Point.4 0.689
9 Young Point.5 0.613
10 Old Point.5 0.653
I currently compute the effect sizes using:
# Cohen's d for every pairwise combination of the levels of `variable`.
Cho_D <- rstatix::cohens_d(DF, value ~ variable, var.equal = TRUE)
But this provides me with a lot of unnecessary calculations like Point.1 and Point.3, Point.1 and Point.4, etc.
I would like to calculate Cohen's d only for each pair of successive points, for example `Point.1:Point.2`, `Point.2:Point.3`, etc. The end goal is to plot the d values on the Y-axis and Points 1 through 50 on the X-axis.
We could `nest` the 'value' column, get the `lead` of the resulting `list` column, and apply `cohen.d` by looping over the two `list` columns in parallel:
library(dplyr)
library(effsize)
library(purrr)
library(tidyr)  # provides nest(); it is not attached by dplyr or purrr

# Cohen's d for each pair of successive points (Point.k vs Point.k+1).
out <- DF %>%
  # AgeGroup is not part of the comparison; dropping it makes nest() key
  # on `variable` alone.
  select(-AgeGroup) %>%
  # One row per point, with that point's values in the list column `data`.
  nest(data = value) %>%
  # Pair every row with the next one: adds `variable_lead` and `data_lead`.
  mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
  # The last point has no successor, so its lead columns are missing.
  slice(-n()) %>%
  # Effect size between each point's values and its successor's values.
  mutate(cohen_d = map2_dbl(data, data_lead,
                            ~ cohen.d(.x$value, .y$value)$estimate))
Output:
# Show the first two successive-pair comparisons.
head(out, n = 2L)
# A tibble: 2 x 5
variable data variable_lead data_lead cohen_d
<fct> <list> <fct> <list> <dbl>
1 Point.1 <tibble [2 × 1]> Point.2 <tibble [2 × 1]> -2.29
2 Point.2 <tibble [2 × 1]> Point.3 <tibble [2 × 1]> 3.46
Comparing with the OP's filtered output:
# Keep only the comparisons between adjacent levels (Point.k vs Point.k+1)
# from the all-pairs table, then show the first two.
# Taking every 49th row would only hit such pairs for the first two rows:
# each block of pairs is one row shorter than the previous one, so row 99
# is already Point.3 vs Point.5.
Cho_D %>%
  filter(
    match(group2, levels(DF$variable)) ==
      match(group1, levels(DF$variable)) + 1L
  ) %>%
  slice_head(n = 2)
# A tibble: 2 x 7
# .y. group1 group2 effsize n1 n2 magnitude
# <chr> <chr> <chr> <dbl> <int> <int> <ord>
#1 value Point.1 Point.2 -2.29 2 2 large
#2 value Point.2 Point.3 3.46 2 2 large
With the number of comparisons reduced from all 1225 pairs to the 49 successive pairs, the timings compare as follows:
# Time the all-pairs computation (every combination of `variable` levels).
system.time({
  Cho_D <- rstatix::cohens_d(DF, value ~ variable, var.equal = TRUE)
})
# user system elapsed
# 16.316 0.060 16.330
# Time the successive-pairs pipeline: nest the values per point, pair each
# point with the next one, then compute one effect size per adjacent pair.
system.time({
  out <- DF %>%
    select(-AgeGroup) %>%
    nest(data = value) %>%
    mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
    slice(-n()) %>%
    mutate(
      cohen_d = map2_dbl(
        data, data_lead,
        function(cur, nxt) cohen.d(cur$value, nxt$value)$estimate
      )
    )
})
# user system elapsed
# 0.031 0.005 0.037