Search code examples
Tags: r · casting · duplicates · subtraction · melt

How to subtract multiple columns of values in a dataframe based on the row being duplicated just once?


Imagine I have a dataframe as such:

#Stack example
# Example data: IDs 1-6 each appear twice per (CV, TR) combination, once per
# timepoint ("2022-08-29" and "2022-09-05"). The last 6 entries of every
# column are deliberate "stray" rows (IDs 7-10): single-timepoint rows and
# same-date duplicates (IDs 9 and 10) that break order-based solutions.
df <- data.frame(DATE = c("2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
                          "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
                          "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
                          "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
                          "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
                          "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
                          "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
                          "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
                          # stray rows: note IDs 9 and 10 below repeat the SAME date
                          "2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29"), 
                 ID = c("1", "2", "3", "4", "5", "6", 
                        "1", "2", "3", "4", "5", "6",
                        "1", "2", "3", "4", "5", "6", 
                        "1", "2", "3", "4", "5", "6",
                        "1", "2", "3", "4", "5", "6", 
                        "1", "2", "3", "4", "5", "6",
                        "1", "2", "3", "4", "5", "6", 
                        "1", "2", "3", "4", "5", "6",
                        "7", "8", "9", "10", "9", "10"),  # stray IDs
                 CV = c("SV", "SV", "SV", "SV", "SV", "SV", 
                        "SV", "SV", "SV", "SV", "SV", "SV", 
                        "PD", "PD", "PD", "PD", "PD", "PD",
                        "PD", "PD", "PD", "PD", "PD", "PD",
                        "SV", "SV", "SV", "SV", "SV", "SV", 
                        "SV", "SV", "SV", "SV", "SV", "SV", 
                        "PD", "PD", "PD", "PD", "PD", "PD",
                        "PD", "PD", "PD", "PD", "PD", "PD",
                        "PD", "PD", "PD", "SV", "PD", "SV"),  # stray rows
                 TR= c("T1", "T1", "T1", "T1", "T1", "T1", 
                       "T1", "T1", "T1", "T1", "T1", "T1", 
                       "T1", "T1", "T1", "T1", "T1", "T1",
                       "T1", "T1", "T1", "T1", "T1", "T1",
                       "T2", "T2", "T2", "T2", "T2", "T2",
                       "T2", "T2", "T2", "T2", "T2", "T2",
                       "T2", "T2", "T2", "T2", "T2", "T2",
                       "T2", "T2", "T2", "T2", "T2", "T2",
                       "T2", "T2", "T2", "T2", "T2", "T2"),
                 Values_1 = c(34.9003695, 34.9003695, 28.2389394, 28.2389394, 26.0821875, 26.0821875,
                              30.5515533, 30.5515533, 22.6469958, 22.6469958, 34.5662974, 34.5662974,
                              35.2881883, 35.2881883, 41.3885176, 41.3885176, 19.9440042, 19.9440042,
                              5.6987524,  5.6987524, 37.4641052, 37.4641052,  2.4808126,  2.4808126,
                              1.7883822, 21.1799057, 21.1799057, 21.1799057,  2.7334442,  2.7334442,
                              2.7334442, 11.3880187, 11.3880187, 11.3880187,  7.8442267,  7.8442267,
                              7.8442267,  5.2445510,  5.2445510,  5.2445510, 20.4706600, 20.4706600,
                              15.7275634, 15.7275634,  4.4575814,  4.4575814, 17.0854186, 17.0854186,
                              5.6987524,  5.6987524, 37.4641052, 37.4641052,  2.4808126,  2.4808126),
                 Values_2 = c(76.24359,  76.24359,  58.52421,  58.52421,  80.14131,  80.14131,  
                              59.05000, 102.19699, 102.19699,  72.39848,  72.39848,  58.15000,
                              68.31217, 68.31217,  53.67941,  53.67941,  56.88980,  56.88980, 
                              108.98399,  96.64207,  96.64207,  38.88542,  38.88542,  54.60000, 
                              52.12500,  52.12500,  17.20875,  17.20875,  47.26923,  47.26923, 
                              67.80738,  60.41250,  60.41250,  83.93404,  83.93404,  37.20336, 
                              50.02500,  50.02500,  94.73309,  94.73309,  41.31748,  41.31748, 
                              56.88344,  59.74702,  59.74702,  48.23750,  48.23750,  95.14831,
                              108.98399,  96.64207,  96.64207,  38.88542,  38.88542,  54.60000))

For every ID, CV, TR combination that has two timepoints ("2022-08-29" and "2022-09-05") I would like to subtract the first timepoint's values (Values_1 & Values_2) from the last timepoint's values and return the difference, generating the following output:

   TR CV ID    Values_1  Values_2
1  T1 PD  1 -29.5894359  40.67182
2  T1 PD  2 -29.5894359  28.32990
3  T1 PD  3  -3.9244124  42.96266
4  T1 PD  4  -3.9244124 -14.79399
5  T1 PD  5 -17.4631916 -18.00438
6  T1 PD  6 -17.4631916  -2.28980
7  T1 SV  1  -4.3488162 -17.19359
8  T1 SV  2  -4.3488162  25.95340
9  T1 SV  3  -5.5919436  43.67278
10 T1 SV  4  -5.5919436  13.87427
11 T1 SV  5   8.4841099  -7.74283
12 T1 SV  6   8.4841099 -21.99131
13 T2 PD  1   7.8833367   6.85844
14 T2 PD  2  10.4830124   9.72202
15 T2 PD  3  -0.7869696 -34.98607
16 T2 PD  4  -0.7869696 -46.49559
17 T2 PD  5  -3.3852414   6.92002
18 T2 PD  6  -3.3852414  53.83083
19 T2 SV  1   0.9450620  15.68238
20 T2 SV  2  -9.7918870   8.28750
21 T2 SV  3  -9.7918870  43.20375
22 T2 SV  4  -9.7918870  66.72529
23 T2 SV  5   5.1107825  36.66481
24 T2 SV  6   5.1107825 -10.06587

I found the following solution, but it does not return the desired result when there are stray rows or additional duplications, because it relies on sorting the dataframe and assumes the sorted order is correct:

# Remove rows whose (ID, CV, TR) combination is not duplicated
# (i.e. rows that do not have two timepoints).
dupe = df[,c("ID", "CV", "TR")] # key columns used to detect duplicates
ONLY_DUPES=df[duplicated(dupe) | duplicated(dupe, fromLast=TRUE),]
# CAVEAT: this keeps any duplicated (ID, CV, TR) group, including two rows
# with the SAME date (e.g. IDs 9 and 10 here), not only true timepoint pairs.

# Now we subtract the first timepoint from the last timepoint.
# First we melt the data (melt()/cast() here are from the reshape package).
ONLY_DUPES <- as.data.frame(ONLY_DUPES)
melted<-melt(ONLY_DUPES, id=c("DATE", "ID", "CV", "TR"))

# Then we cast with diff().
# First we order the dataframe -- diff() depends entirely on this sort order
# (value at row 2 minus value at row 1 within each cast cell), which is why
# the stray / same-date rows produce the wrong rows flagged with "!" below.
melted= melted[with(melted, order(TR, CV, ID, variable, DATE)),]
casted=cast(melted, TR+CV+ID  ~variable, fun.aggregate = diff)

#Returns:
   TR CV ID    Values_1  Values_2
1  T1 PD  1 -29.5894359  40.67182
2  T1 PD  2 -29.5894359  28.32990
3  T1 PD  3  -3.9244124  42.96266
4  T1 PD  4  -3.9244124 -14.79399
5  T1 PD  5 -17.4631916 -18.00438
6  T1 PD  6 -17.4631916  -2.28980
7  T1 SV  1  -4.3488162 -17.19359
8  T1 SV  2  -4.3488162  25.95340
9  T1 SV  3  -5.5919436  43.67278
10 T1 SV  4  -5.5919436  13.87427
11 T1 SV  5   8.4841099  -7.74283
12 T1 SV  6   8.4841099 -21.99131
13 T2 PD  1   7.8833367   6.85844
14 T2 PD  2  10.4830124   9.72202
15 T2 PD  3  -0.7869696 -34.98607
16 T2 PD  4  -0.7869696 -46.49559
17 T2 PD  5  -3.3852414   6.92002
18 T2 PD  6  -3.3852414  53.83083
19 T2 PD  9 -34.9832926 -57.75665 !
20 T2 SV  1   0.9450620  15.68238
21 T2 SV 10 -34.9832926  15.71458 !
22 T2 SV  2  -9.7918870   8.28750
23 T2 SV  3  -9.7918870  43.20375
24 T2 SV  4  -9.7918870  66.72529
25 T2 SV  5   5.1107825  36.66481
26 T2 SV  6   5.1107825 -10.06587

What method can I apply to perform this operation in a fail-proof way?


Solution

  • Update for >2 value columns:

    If you have multiple columns you can use across() to define the columns you want to apply a function. With this you only have to enter the start and end column you want to manipulate:

    For this example I just copied the Values_1 and Values_2 and stored them in new variables Values_3 and Values_4 respectively.

    library(tidyverse)
    
    # Per (ID, CV, TR) group, subtract the first timepoint from the second
    # for every value column at once.
    # starts_with("Values") picks up however many Values_* columns exist
    # (Values_1:Values_2 here, Values_1:Values_4 if you copied the columns),
    # so the code no longer errors when Values_3/Values_4 are absent.
    df %>%
      group_by(ID, CV, TR) %>%
      # Keep only groups with exactly one row per timepoint: stray single-date
      # rows and same-date duplicates (IDs 7-10) would otherwise make the
      # subtraction length 0 or length 2 and break summarise().
      filter(n() == 2, n_distinct(DATE) == 2) %>%
      summarise(across(starts_with("Values"), ~ .x[DATE == "2022-09-05"] - .x[DATE == "2022-08-29"])) %>%
      arrange(TR, CV) %>%
      print(., n = 24)
    
    #> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
    #> the `.groups` argument.
    #> # A tibble: 24 × 7
    #> # Groups:   ID, CV, TR [24]
    #>    ID    CV    TR    Values_1 Values_2 Values_3 Values_4
    #>    <chr> <chr> <chr>    <dbl>    <dbl>    <dbl>    <dbl>
    #>  1 1     PD    T1     -29.6      40.7   -29.6      40.7 
    #>  2 2     PD    T1     -29.6      28.3   -29.6      28.3 
    #>  3 3     PD    T1      -3.92     43.0    -3.92     43.0 
    #>  4 4     PD    T1      -3.92    -14.8    -3.92    -14.8 
    #>  5 5     PD    T1     -17.5     -18.0   -17.5     -18.0 
    #>  6 6     PD    T1     -17.5      -2.29  -17.5      -2.29
    #>  7 1     SV    T1      -4.35    -17.2    -4.35    -17.2 
    #>  8 2     SV    T1      -4.35     26.0    -4.35     26.0 
    #>  9 3     SV    T1      -5.59     43.7    -5.59     43.7 
    #> 10 4     SV    T1      -5.59     13.9    -5.59     13.9 
    #> 11 5     SV    T1       8.48     -7.74    8.48     -7.74
    #> 12 6     SV    T1       8.48    -22.0     8.48    -22.0 
    #> 13 1     PD    T2       7.88      6.86    7.88      6.86
    #> 14 2     PD    T2      10.5       9.72   10.5       9.72
    #> 15 3     PD    T2      -0.787   -35.0    -0.787   -35.0 
    #> 16 4     PD    T2      -0.787   -46.5    -0.787   -46.5 
    #> 17 5     PD    T2      -3.39      6.92   -3.39      6.92
    #> 18 6     PD    T2      -3.39     53.8    -3.39     53.8 
    #> 19 1     SV    T2       0.945    15.7     0.945    15.7 
    #> 20 2     SV    T2      -9.79      8.29   -9.79      8.29
    #> 21 3     SV    T2      -9.79     43.2    -9.79     43.2 
    #> 22 4     SV    T2      -9.79     66.7    -9.79     66.7 
    #> 23 5     SV    T2       5.11     36.7     5.11     36.7 
    #> 24 6     SV    T2       5.11    -10.1     5.11    -10.1
    

    Created on 2022-09-06 with reprex v2.0.2

    First Answer:

    This should work. Using the tidyverse package, you can first group by the variables ID, CV and TR and then use summarise() to calculate the subtractions based on their values on DATE 1 and DATE 2:

    Mind that the last command print(., n = 24) is only used to show the output here and can be omitted when you copy the code.

    library(tidyverse)
    
    # Per (ID, CV, TR) group, subtract the first timepoint's value from the
    # second timepoint's value for Values_1 and Values_2.
    df %>%
      group_by(ID, CV, TR) %>%
      # Drop groups that do not have exactly one row per timepoint (stray
      # single-date rows and same-date duplicates, IDs 7-10): without this
      # guard, the indexed subtraction yields length-0 or length-2 results
      # and summarise() fails or returns spurious rows.
      filter(n() == 2, n_distinct(DATE) == 2) %>%
      summarise(Values_1_new = Values_1[DATE == "2022-09-05"] - Values_1[DATE == "2022-08-29"],
                Values_2_new = Values_2[DATE == "2022-09-05"] - Values_2[DATE == "2022-08-29"]) %>%
      arrange(TR, CV) %>%
      print(., n = 24)
    #> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
    #> the `.groups` argument.
    #> # A tibble: 24 × 5
    #> # Groups:   ID, CV, TR [24]
    #>    ID    CV    TR    Values_1_new Values_2_new
    #>    <chr> <chr> <chr>        <dbl>        <dbl>
    #>  1 1     PD    T1         -29.6          40.7 
    #>  2 2     PD    T1         -29.6          28.3 
    #>  3 3     PD    T1          -3.92         43.0 
    #>  4 4     PD    T1          -3.92        -14.8 
    #>  5 5     PD    T1         -17.5         -18.0 
    #>  6 6     PD    T1         -17.5          -2.29
    #>  7 1     SV    T1          -4.35        -17.2 
    #>  8 2     SV    T1          -4.35         26.0 
    #>  9 3     SV    T1          -5.59         43.7 
    #> 10 4     SV    T1          -5.59         13.9 
    #> 11 5     SV    T1           8.48         -7.74
    #> 12 6     SV    T1           8.48        -22.0 
    #> 13 1     PD    T2           7.88          6.86
    #> 14 2     PD    T2          10.5           9.72
    #> 15 3     PD    T2          -0.787       -35.0 
    #> 16 4     PD    T2          -0.787       -46.5 
    #> 17 5     PD    T2          -3.39          6.92
    #> 18 6     PD    T2          -3.39         53.8 
    #> 19 1     SV    T2           0.945        15.7 
    #> 20 2     SV    T2          -9.79          8.29
    #> 21 3     SV    T2          -9.79         43.2 
    #> 22 4     SV    T2          -9.79         66.7 
    #> 23 5     SV    T2           5.11         36.7 
    #> 24 6     SV    T2           5.11        -10.1
    

    Created on 2022-09-06 with reprex v2.0.2