Search code examples
r group-by dplyr summarize

Calculate total values when summarising grouped data


I have a data frame containing grouped data. I am calculating summary statistics by group, but I also want the same summary statistics for all the data combined. Is there a simpler way to do this than computing the summaries twice and combining the results, as follows?

# The pipe, group_by() and summarise() used below all come from dplyr
library(dplyr)

# Example data: three groups ("a", "b", "c") with ten observations each
dataDF <- data.frame(
  group = rep(c("a", "b", "c"), 10),
  value1 = rnorm(30),
  value2 = 1:30
)

# Summary statistics computed separately for each group
grouped <- dataDF %>%
  group_by(group) %>%
  summarise(
    mean1 = mean(value1),
    mean2 = mean(value2),
    sd1 = sd(value1),
    sd2 = sd(value2),
    max1 = max(value1),
    max2 = max(value2)
  )

# The same statistics computed once over the whole data set
total <- dataDF %>%
  summarise(
    mean1 = mean(value1),
    mean2 = mean(value2),
    sd1 = sd(value1),
    sd2 = sd(value2),
    max1 = max(value1),
    max2 = max(value2)
  )

# Stack the per-group rows on top of a single row labelled "All"
combined <- rbind(
  grouped,
  data.frame(
    group = "All",
    total
  )
)

> combined
# A tibble: 4 x 7
  group  mean1 mean2   sd1   sd2  max1  max2
  <fct>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a      0.336  14.5 1.15   9.08  1.98    28
2 b     -0.215  15.5 1.17   9.08  1.30    29
3 c      0.332  16.5 0.874  9.08  2.19    30
4 All    0.151  15.5 1.07   8.80  2.19    30

Solution

  • I have written a function to do this

    # Summarise grouped data and append an overall "All" row.
    #
    # Arguments:
    #   data  A data frame or tibble, normally grouped with group_by().
    #   func  A summarising function called as func(data, ...), e.g.
    #         summarise() or summarise_at().
    #   ...   Further arguments passed unchanged to func.
    #
    # Returns a plain data.frame holding the per-group results followed by
    # one row, computed on the ungrouped data, whose grouping columns are
    # all set to "All". If data has no grouping columns, only the overall
    # summary is returned.
    summarise_with_total <- function(data, func, ...) {
      stopifnot(is.function(func))

      # Grouping column names as a character vector
      group_cols <- dplyr::group_vars(data)

      # Summary over everything, ignoring any grouping
      overall <- data.frame(func(dplyr::ungroup(data), ...))

      # Without groups the per-group and overall summaries are the same
      # rows, so stacking them would only duplicate the result.
      if (length(group_cols) == 0L) {
        return(overall)
      }

      # Summary per group
      by_group <- data.frame(func(data, ...))

      # One "All" label per grouping column, placed in front of the overall
      # summary; kept as character so it does not become a factor on R < 4.0
      all_labels <- setNames(
        as.list(rep("All", length(group_cols))),
        group_cols
      )
      total_row <- data.frame(all_labels, overall, stringsAsFactors = FALSE)

      rbind(by_group, total_row)
    }
    
    # Per-group mean, sd and max of value1 and value2, plus an "All" row.
    # A named list of functions replaces the deprecated funs() helper and
    # yields the same <column>_<function> output names.
    dataDF %>%
      group_by(group) %>%
      summarise_with_total(summarise_at,
                           .vars = c("value1", "value2"),
                           .funs = list(mean = mean, sd = sd, max = max))