I have the following example data frame and would like to calculate the actual and percentage differences across multiple paired variables ("10" and "20" correspond to year tested) at once:
sample data:
Group| A_10 | A_20 | B_10 | B_20
0 20 21 20 23
1 30 10 19 11
2 10 53 30 34
1 22 32 25 20
2 34 40 32 30
0 30 50 NA 40
0 39 40 19 20
1 40 NA 20 20
2 50 10 20 10
0 34 23 30 10
This is the current working code:
library(dplyr)
# Assuming data frame is named 'df' and has the following structure:
# 'var1_1', 'var1_2', ... represent the first set of variables
# 'var2_1', 'var2_2', ... represent the second set of variables
# Define the pairs of variables for which you want to calculate the differences
variable_pairs <- list(
c("A_10", "A_20"),
c("B_10", "B_20")) # I have another 20 paired variabels
# Calculate the actual and percentage differences for each variable pair
df6 <- df %>%
mutate(
across(
all_of(unlist(variable_pairs)),
~ .x - get(variable_pairs[[cur_column()]][2]),
.names = "{.col}_actual_diff"
),
across(
all_of(unlist(variable_pairs)),
~ (.x - get(variable_pairs[[cur_column()]][2])) / get(variable_pairs[[cur_column()]][2]) * 100,
.names = "{.col}_percentage_diff"
)
)
Unfortunately I am going wrong somewhere or overcomplicating things. The above code give this error: Error in `mutate()`: ℹ In argument: `across(...)`. Caused by error in `across()`: ! Can't compute column `vo2mlkg_12_actual_diff`. Caused by error in `get()`: ! invalid first argument Run `rlang::last_trace()` to see where the error occurred.
Can anyone suggest a fix or a simpler solution.
addendum:
long data
Group| variable | phase | Value |
0 A 10 20
1 B 20 19
2 C 20 30
1 D 10 25
2 E 20 32
0 F 10 NA
0 G 20 19
1 H 10 20
2 I 10 20
0 J 20 30
Solution thanks to @Maël:
library(dplyr)
library(tidyr)
library(magrittr)
df2 <- df[,-2]
df2 %<>%
pivot_longer(-group, names_sep = "_", names_to = c("set", ".value")) %>%
{colnames(.) <- c("group", "set", "pre", "post"); .} %>%
mutate(
diff = post - pre,
diff_perc = ((post - pre) / pre) * 100
)%>%
group_by(group, set) %>%
summarize(
mean_diff = mean(diff, na.rm = TRUE),
mean_diff_perc = mean(diff_perc, na.rm = TRUE)
) %>%
pivot_wider(names_from = set, values_from = c(mean_diff, mean_diff_perc))
You can use multiple across
:
library(dplyr)
df %>%
mutate(across(matches("_post$"), .names = "{gsub('post','', .col)}diff") - across(matches("_pre$")),
(across(matches("_post$"), .names = "{gsub('post','', .col)}perc_diff") - across(matches("_pre$"))) / across(matches("_post$"))) %>%
# # A tibble: 10 × 9
# Group A_pre A_post B_pre B_post A_diff B_diff A_perc_diff B_perc_diff
# <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl>
# 1 0 20 21 20 23 1 3 0.0476 0.130
# 2 1 30 10 19 11 -20 -8 -2 -0.727
# 3 2 10 53 30 34 43 4 0.811 0.118
# 4 1 22 32 25 20 10 -5 0.312 -0.25
# 5 2 34 40 32 30 6 -2 0.15 -0.0667
# 6 0 30 50 NA 40 20 NA 0.4 NA
# 7 0 39 40 19 20 1 1 0.025 0.05
# 8 1 40 NA 20 20 NA 0 NA 0
# 9 2 50 10 20 10 -40 -10 -4 -1
# 10 0 34 23 30 10 -11 -20 -0.478 -2
Or, probably simpler, you can pivot your data first, and then compute the differences:
library(tidyr)
df %>%
pivot_longer(-Group, names_sep = "_", names_to = c("set", ".value")) %>%
mutate(diff = post - pre,
diff_perc = (post - pre) / post)
# # A tibble: 20 × 6
# Group set pre post diff diff_perc
# <int> <chr> <int> <int> <int> <dbl>
# 1 0 A 20 21 1 0.0476
# 2 0 B 20 23 3 0.130
# 3 1 A 30 10 -20 -2
# 4 1 B 19 11 -8 -0.727
# 5 2 A 10 53 43 0.811
# 6 2 B 30 34 4 0.118
# 7 1 A 22 32 10 0.312
# 8 1 B 25 20 -5 -0.25
# 9 2 A 34 40 6 0.15
# 10 2 B 32 30 -2 -0.0667
# 11 0 A 30 50 20 0.4
# 12 0 B NA 40 NA NA
# 13 0 A 39 40 1 0.025
# 14 0 B 19 20 1 0.05
# 15 1 A 40 NA NA NA
# 16 1 B 20 20 0 0
# 17 2 A 50 10 -40 -4
# 18 2 B 20 10 -10 -1
# 19 0 A 34 23 -11 -0.478
# 20 0 B 30 10 -20 -2