Search code examples
rtidyverse

calculate percentages using tidyverse


I'm looking to divide the counts of each group from Type and Stream_Order by the total counts of all groups. I have the dataframe below. For example, for the Sediment Sample with Stream Order "1" in group "stoich", I'd like to divide the "n" value 1445 by it's counterpart in group "all": (1445/6312)*100 = 22.89 to calculate a percentage. So far I have

perc %>%
  group_by(Type,Stream_Order) %>%
  mutate(percentage = n/total*100)

But I realize I'm missing a lot. Sure I can do these by hand, but I would like to be prepared in the future for larger datasets.

> dput(perc)
structure(list(Type = c("Sediment", "Sediment", "Surface Water", 
"Surface Water", "Sediment", "Sediment", "Surface Water", "Surface Water", 
"Sediment", "Sediment", "Surface Water", "Surface Water", "Sediment", 
"Sediment", "Surface Water", "Surface Water"), Stream_Order = structure(c(1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "Not_Provided"), class = "factor"), 
    n = c(1445L, 639L, 1736L, 815L, 4580L, 2456L, 3999L, 1743L, 
    287L, 109L, 656L, 450L, 6312L, 3204L, 6391L, 3008L), meancorr = c(0.292071913949931, 
    0.262339039457943, -0.089326777121153, -0.259460896530476, 
    7.82761635773663, 8.66618572293474, 9.85787757997926, 8.06533374329178, 
    -3.04777412402461, -3.0807907346078, -3.07173828840263, -3.17877354374679, 
    5.60801895761982, 6.59050580264329, 5.82874680645636, 4.12764893257951
    ), max = c(1.99996435865152, 1.98765952360616, 1.99718852523326, 
    1.99609913106005, 32.9381587597398, 33.1286964665043, 38.3287173847001, 
    28.8178499883634, -2.00562491853763, -2.01247690847229, -2.00017475758744, 
    -2.00391643807592, 32.9381587597398, 33.1286964665043, 38.3287173847001, 
    28.8178499883634), min = c(-1.99674391114338, -1.99895657727752, 
    -1.99815485165027, -1.9986527287077, 2.00180602762732, 2.00187366008464, 
    2.01245579505433, 2.0066032552998, -7.194006200101, -6.68838345991404, 
    -6.69361085662866, -6.27066223419871, -7.194006200101, -6.68838345991404, 
    -6.69361085662866, -6.27066223419871), Group = c("stoch", 
    "stoch", "stoch", "stoch", "hetero", "hetero", "hetero", 
    "hetero", "homo", "homo", "homo", "homo", "all", "all", "all", 
    "all")), class = "data.frame", row.names = c(NA, -16L))

Solution

  • We can subset 'n' using a logical expression on the 'Group' column (Group == 'all')

    library(dplyr)
    perc <- perc %>%
      group_by(Type, Stream_Order) %>%
      mutate(percentage = n/n[Group == 'all']*100) %>%
      ungroup
    

    -output

    perc
    # A tibble: 16 × 8
       Type          Stream_Order     n meancorr   max   min Group  percentage
       <chr>         <fct>        <int>    <dbl> <dbl> <dbl> <chr>       <dbl>
     1 Sediment      1             1445   0.292   2.00 -2.00 stoch       22.9 
     2 Sediment      2              639   0.262   1.99 -2.00 stoch       19.9 
     3 Surface Water 1             1736  -0.0893  2.00 -2.00 stoch       27.2 
     4 Surface Water 2              815  -0.259   2.00 -2.00 stoch       27.1 
     5 Sediment      1             4580   7.83   32.9   2.00 hetero      72.6 
     6 Sediment      2             2456   8.67   33.1   2.00 hetero      76.7 
     7 Surface Water 1             3999   9.86   38.3   2.01 hetero      62.6 
     8 Surface Water 2             1743   8.07   28.8   2.01 hetero      57.9 
     9 Sediment      1              287  -3.05   -2.01 -7.19 homo         4.55
    10 Sediment      2              109  -3.08   -2.01 -6.69 homo         3.40
    11 Surface Water 1              656  -3.07   -2.00 -6.69 homo        10.3 
    12 Surface Water 2              450  -3.18   -2.00 -6.27 homo        15.0 
    13 Sediment      1             6312   5.61   32.9  -7.19 all        100   
    14 Sediment      2             3204   6.59   33.1  -6.69 all        100   
    15 Surface Water 1             6391   5.83   38.3  -6.69 all        100   
    16 Surface Water 2             3008   4.13   28.8  -6.27 all        100