Search code examples
rdplyrpercentagemutateacross

Calculating the actual difference and percentage difference for multiple paired variables simultaneously


I have the following example data frame and would like to calculate the actual and percentage differences across multiple paired variables ("10" and "20" correspond to year tested) at once:

sample data:

Group| A_10  |    A_20 |  B_10 |  B_20 

0       20          21        20        23
1       30          10        19        11
2       10          53        30        34
1       22          32        25        20
2       34          40        32        30
0       30          50        NA        40
0       39          40        19        20
1       40          NA        20        20
2       50          10        20        10
0       34          23        30        10

This is the current working code:

library(dplyr)

# Assuming data frame is named 'df' and has the following structure:
# 'var1_1', 'var1_2', ... represent the first set of variables
# 'var2_1', 'var2_2', ... represent the second set of variables

# Define the pairs of variables for which you want to calculate the differences
variable_pairs <- list(
  c("A_10", "A_20"),
  c("B_10", "B_20")) # I have another 20 paired variabels    

# Calculate the actual and percentage differences for each variable pair
df6 <- df %>%
  mutate(
    across(
      all_of(unlist(variable_pairs)),
      ~ .x - get(variable_pairs[[cur_column()]][2]),
      .names = "{.col}_actual_diff"
    ),
    across(
      all_of(unlist(variable_pairs)),
      ~ (.x - get(variable_pairs[[cur_column()]][2])) / get(variable_pairs[[cur_column()]][2]) * 100,
      .names = "{.col}_percentage_diff"
    )
  )

Unfortunately I am going wrong somewhere or overcomplicating things. The above code give this error: Error in `mutate()`: ℹ In argument: `across(...)`. Caused by error in `across()`: ! Can't compute column `vo2mlkg_12_actual_diff`. Caused by error in `get()`: ! invalid first argument Run `rlang::last_trace()` to see where the error occurred.

Can anyone suggest a fix or a simpler solution.

addendum:

long data

Group| variable  | phase |  Value | 

0       A           10        20        
1       B           20        19        
2       C           20        30        
1       D           10        25        
2       E           20        32       
0       F           10        NA        
0       G           20        19        
1       H           10        20        
2       I           10        20        
0       J           20        30        

Solution thanks to @Maël:


library(dplyr)
library(tidyr)
library(magrittr)

df2 <- df[,-2] 
df2 %<>%
  pivot_longer(-group, names_sep = "_", names_to = c("set", ".value")) %>%
  {colnames(.) <- c("group", "set", "pre", "post"); .} %>%
  mutate(
    diff = post - pre,
    diff_perc = ((post - pre) / pre) * 100
  )%>%
  group_by(group, set) %>%
  summarize(
    mean_diff = mean(diff, na.rm = TRUE),
    mean_diff_perc = mean(diff_perc, na.rm = TRUE)
  ) %>%
  pivot_wider(names_from = set, values_from = c(mean_diff, mean_diff_perc))

Solution

  • You can use multiple across:

    library(dplyr)
    df %>% 
      mutate(across(matches("_post$"), .names = "{gsub('post','', .col)}diff") - across(matches("_pre$")),
             (across(matches("_post$"), .names = "{gsub('post','', .col)}perc_diff") - across(matches("_pre$"))) / across(matches("_post$"))) %>% 
    
    # # A tibble: 10 × 9
    #    Group A_pre A_post B_pre B_post A_diff B_diff A_perc_diff B_perc_diff
    #    <int> <int>  <int> <int>  <int>  <int>  <int>       <dbl>       <dbl>
    #  1     0    20     21    20     23      1      3      0.0476      0.130 
    #  2     1    30     10    19     11    -20     -8     -2          -0.727 
    #  3     2    10     53    30     34     43      4      0.811       0.118 
    #  4     1    22     32    25     20     10     -5      0.312      -0.25  
    #  5     2    34     40    32     30      6     -2      0.15       -0.0667
    #  6     0    30     50    NA     40     20     NA      0.4        NA     
    #  7     0    39     40    19     20      1      1      0.025       0.05  
    #  8     1    40     NA    20     20     NA      0     NA           0     
    #  9     2    50     10    20     10    -40    -10     -4          -1     
    # 10     0    34     23    30     10    -11    -20     -0.478      -2     
    

    Or, probably simpler, you can pivot your data first, and then compute the differences:

    library(tidyr)
    df %>% 
      pivot_longer(-Group, names_sep = "_", names_to = c("set", ".value")) %>% 
      mutate(diff = post - pre,
             diff_perc = (post - pre) / post)
    
    # # A tibble: 20 × 6
    #    Group set     pre  post  diff diff_perc
    #    <int> <chr> <int> <int> <int>     <dbl>
    #  1     0 A        20    21     1    0.0476
    #  2     0 B        20    23     3    0.130 
    #  3     1 A        30    10   -20   -2     
    #  4     1 B        19    11    -8   -0.727 
    #  5     2 A        10    53    43    0.811 
    #  6     2 B        30    34     4    0.118 
    #  7     1 A        22    32    10    0.312 
    #  8     1 B        25    20    -5   -0.25  
    #  9     2 A        34    40     6    0.15  
    # 10     2 B        32    30    -2   -0.0667
    # 11     0 A        30    50    20    0.4   
    # 12     0 B        NA    40    NA   NA     
    # 13     0 A        39    40     1    0.025 
    # 14     0 B        19    20     1    0.05  
    # 15     1 A        40    NA    NA   NA     
    # 16     1 B        20    20     0    0     
    # 17     2 A        50    10   -40   -4     
    # 18     2 B        20    10   -10   -1     
    # 19     0 A        34    23   -11   -0.478 
    # 20     0 B        30    10   -20   -2