Search code examples
rdplyrgroup-bysummarizemutate

combine redundant row items in r


I have a dataset with the the names of many different plant species (column MTmatch), some of which appear repeatedly. Each of these has a column (ReadSum) with a sum associated with it (as well as many other pieces of information). How do I combine/aggregate all of the redundant plant species and sum the associated ReadSum with each, while leaving the non-redundant rows alone?

I would like to take a dataset like this, and either have it transformed so that each sample has the aggregate of the combined rows, or at least an additional column showing the sum of the ReadSum column for the combined redundant species. Sorry if this is confusing, I'm not sure how to ask this question.

I have been messing about with dplyr, using group_by() and summarise(), but that seems to be summarizing across the whole column rather than just the new group.

structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
"ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
"ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
0, 0, 0.01513888, 0, 0.00115466)), row.names = c(NA, -9L), class = "data.frame")
> dput(samp5[1:9])
structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
"ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
"ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
0, 0, 0.01513888, 0, 0.00115466), S026409.R1 = c(0.221175955, 
0, 0, 0, 0, 0, 0.005146173, 0, 0), S026412.R1 = c(0.026058888, 
0, 0, 0, 0, 0, 0, 0, 0), MAX = c(0.400577608, 0.009933177, 0.124412855, 
0.00814677, 0.009824944, 0.086475106, 0.154850408, 0.015593835, 
0.008340888), ReadSum = c(3.54892343, 0.012059346, 0.203303936, 
0.021075546, 0.009824944, 0.128007863, 0.859687787, 0.068159534, 
0.050266853), SPECIES = c("Abies ", "Abies ", "Acer", "Alnus", 
"Berberis", "Betula ", "Boykinia", "Boykinia", "Boykinia")), row.names = c(NA, 
-9L), class = "data.frame")

Solution

  • Do either of these approached produce your intended outcome?

    Data:

    df <- structure(list(ESVID = c("ESV_000090", "ESV_000682", "ESV_000028", 
                             "ESV_000030", "ESV_000010", "ESV_000182", "ESV_000040", "ESV_000135", 
                             "ESV_000383"), S026401.R1 = c(0.222447727, 0, 0, 0, 0, 0, 0.029074432, 
                                                           0, 0), S026404.R1 = c(0.022583349, 0, 0, 0, 0, 0, 0.016390389, 
                                                                                 0.001257217, 0), S026406.R1 = c(0.360895503, 0, 0, 0.00814677, 
                                                                                                                 0, 0, 0.01513888, 0, 0.00115466), S026409.R1 = c(0.221175955, 
                                                                                                                                                                  0, 0, 0, 0, 0, 0.005146173, 0, 0), S026412.R1 = c(0.026058888, 
                                                                                                                                                                                                                    0, 0, 0, 0, 0, 0, 0, 0), MAX = c(0.400577608, 0.009933177, 0.124412855, 
                                                                                                                                                                                                                                                     0.00814677, 0.009824944, 0.086475106, 0.154850408, 0.015593835, 
                                                                                                                                                                                                                                                     0.008340888), ReadSum = c(3.54892343, 0.012059346, 0.203303936, 
                                                                                                                                                                                                                                                                               0.021075546, 0.009824944, 0.128007863, 0.859687787, 0.068159534, 
                                                                                                                                                                                                                                                                               0.050266853), SPECIES = c("Abies ", "Abies ", "Acer", "Alnus", 
                                                                                                                                                                                                                                                                                                         "Berberis", "Betula ", "Boykinia", "Boykinia", "Boykinia")), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                    -9L), class = "data.frame")
    

    Create a new column "combined_ReadSum" (2nd col) which is the sum of "ReadSum" for each "SPECIES":

    library(dplyr)
    
    df %>%
      group_by(SPECIES) %>%
      summarise(combined_ReadSum = sum(ReadSum)) %>%
      left_join(df, by = "SPECIES")
    #> # A tibble: 9 × 10
    #>   SPECIES  combi…¹ ESVID S0264…² S0264…³ S0264…⁴ S0264…⁵ S0264…⁶     MAX ReadSum
    #>   <chr>      <dbl> <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
    #> 1 "Abies " 3.56    ESV_…  0.222  0.0226  0.361   0.221    0.0261 0.401   3.55   
    #> 2 "Abies " 3.56    ESV_…  0      0       0       0        0      0.00993 0.0121 
    #> 3 "Acer"   0.203   ESV_…  0      0       0       0        0      0.124   0.203  
    #> 4 "Alnus"  0.0211  ESV_…  0      0       0.00815 0        0      0.00815 0.0211 
    #> 5 "Berber… 0.00982 ESV_…  0      0       0       0        0      0.00982 0.00982
    #> 6 "Betula… 0.128   ESV_…  0      0       0       0        0      0.0865  0.128  
    #> 7 "Boykin… 0.978   ESV_…  0.0291 0.0164  0.0151  0.00515  0      0.155   0.860  
    #> 8 "Boykin… 0.978   ESV_…  0      0.00126 0       0        0      0.0156  0.0682 
    #> 9 "Boykin… 0.978   ESV_…  0      0       0.00115 0        0      0.00834 0.0503 
    #> # … with abbreviated variable names ¹​combined_ReadSum, ²​S026401.R1,
    #> #   ³​S026404.R1, ⁴​S026406.R1, ⁵​S026409.R1, ⁶​S026412.R1
    

    Or, summarise columns by summing the values for each unique species:

    library(dplyr)
    
    df %>%
      group_by(SPECIES) %>%
      summarise(across(where(is.numeric), sum))
    #> # A tibble: 6 × 8
    #>   SPECIES    S026401.R1 S026404.R1 S026406.R1 S026409.R1 S0264…¹     MAX ReadSum
    #>   <chr>           <dbl>      <dbl>      <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
    #> 1 "Abies "       0.222      0.0226    0.361      0.221    0.0261 0.411   3.56   
    #> 2 "Acer"         0          0         0          0        0      0.124   0.203  
    #> 3 "Alnus"        0          0         0.00815    0        0      0.00815 0.0211 
    #> 4 "Berberis"     0          0         0          0        0      0.00982 0.00982
    #> 5 "Betula "      0          0         0          0        0      0.0865  0.128  
    #> 6 "Boykinia"     0.0291     0.0176    0.0163     0.00515  0      0.179   0.978  
    #> # … with abbreviated variable name ¹​S026412.R1
    

    Created on 2022-10-28 by the reprex package (v2.0.1)