Search code examples
rdplyrregressionaggregateweighted-average

Why are these means different when computed by dplyr mutate vs summarize in group_by?


My dataframe contains:

  • a column deceased on which I compute aggregated means later on (mortality ratios, by gender)
  • a weighting variable n.group
  • a categorical sex (1: female, 2: male)

I don't understand why the means and weighted-means m.mortf, w.mortf are wrong when calculated below in one single mutate/summarize expression.

Dataframe:

red11 <- structure(list(hosptg = structure(c(3L, 3L, 1L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 
3L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", 
"2", "3"), class = "factor"), quarter.adm = structure(c(4L, 11L, 
3L, 12L, 7L, 8L, 12L, 9L, 1L, 11L, 7L, 1L, 2L, 2L, 10L, 10L, 
8L, 11L, 6L, 1L, 4L, 6L, 10L, 10L, 6L, 11L, 11L, 7L, 3L, 6L, 
10L, 12L, 7L, 6L, 6L, 3L, 6L, 12L, 4L, 4L, 12L, 1L, 6L, 5L, 11L, 
9L, 4L, 4L, 3L, 10L, 4L, 8L, 10L, 3L, 7L, 1L, 12L, 5L, 4L, 6L, 
6L, 3L, 9L, 7L, 8L, 3L, 7L, 8L, 7L, 6L, 5L, 11L, 9L, 11L, 1L, 
4L, 6L, 5L, 5L, 6L, 5L, 5L, 11L, 3L, 4L, 12L, 12L, 1L, 9L, 9L, 
6L, 9L, 1L, 4L, 8L, 1L, 5L, 2L, 9L, 11L), .Label = c("2011Q1", 
"2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3", "2012Q4", 
"2013Q1", "2013Q2", "2013Q3", "2013Q4"), class = "factor"), g.mdc = c("08", 
"05", "09", "08", "14", "15", "15", "11", "09", "01", "08", "11", 
"16", "14", "08", "06", "08", "06", "06", "08", "15", "14", "14", 
"08", "11", "09", "08", "08", "06", "06", "06", "08", "03", "05", 
"05", "15", "02", "05", "08", "04", "04", "10", "06", "01", "08", 
"05", "03", "06", "01", "01", "06", "08", "08", "04", "12", "05", 
"01", "15", "08", "01", "08", "01", "05", "15", "15", "01", "06", 
"15", "01", "08", "01", "05", "08", "02", "15", "03", "06", "05", 
"05", "03", "09", "08", "11", "12", "06", "04", "08", "01", "06", 
"01", "08", "06", "15", "05", "08", "07", "08", "13", "08", "08"
), sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"), 
    age = c(23L, 83L, 51L, 54L, 37L, 0L, 0L, 82L, 45L, 88L, 84L, 
    58L, 41L, 33L, 71L, 79L, 67L, 42L, 73L, 66L, 0L, 26L, 38L, 
    65L, 31L, 87L, 38L, 38L, 77L, 44L, 54L, 74L, 38L, 70L, 44L, 
    0L, 78L, 65L, 56L, 85L, 70L, 83L, 89L, 46L, 39L, 34L, 5L, 
    85L, 18L, 5L, 41L, 73L, 18L, 41L, 75L, 77L, 36L, 0L, 84L, 
    83L, 58L, 93L, 83L, 0L, 0L, 2L, 49L, 0L, 55L, 46L, 40L, 81L, 
    60L, 51L, 0L, 22L, 78L, 69L, 75L, 65L, 31L, 15L, 79L, 87L, 
    72L, 78L, 48L, 16L, 81L, 63L, 84L, 17L, 0L, 60L, 60L, 74L, 
    44L, 44L, 53L, 71L), deceased = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
    n.group = c(3L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 
    2L, 1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 
    1L, 2L, 2L, 2L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 
    1L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L, 
    2L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L, 
    1L, 1L, 1L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L)), class = c("tbl_df", 
"tbl", "data.frame"), .Names = c("hosptg", "quarter.adm", "g.mdc", 
"sex", "age", "deceased", "n.group"), row.names = c(NA, -100L
))

Grouping - using mutate:

red111 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc)  %>%
    mutate(n=n(),
      female = mean(sex == '1', na.rm=T), 
      age = mean(age, na.rm=T),
      m.mortf = mean(deceased == '1', na.rm=T),
      w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))

Grouping - using summarize (i.e. aggregation):

red211 <- red11 %>%
  group_by(hosptg, quarter.adm, g.mdc) %>%  
  summarize(n=n(),
    female  = mean(sex == '1', na.rm=T),
    age     = mean(age, na.rm=T),
    m.mortf = mean(deceased == '1', na.rm=T),
    w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))

I would have expected the ratio being the same and foremost keeping the initial mean. I understand what the aggregation does this is also illustrated by the sum(redxx$n) but I struggle comprehending the full background.

Initial data frame mean:

 mean(red11$deceased == 1, na.rm=T)  [1] 0.02

Mutate mean and sum:

sum(red211$n)           [1] 170
> mean(red111$female)   [1] 0.52
> mean(red111$w.mortf)  [1] 0.02
> mean(red111$m.mortf)  [1] 0.02

Summarized mean and sum:

sum(red211$n)           [1] 100
mean(red211$female)     [1] 0.4977169
mean(red211$w.mortf)    [1] 0.02739726
mean(red211$m.mortf)    [1] 0.02739726

What I would like to have is an aggregated data frame (i.e. reduced number of lines) maintaining the initial mean throughout. And, why does the weighting variable not compensate for it?

EDIT: My basic intention is that I am using a big data file where I have single entries where a case may be deceased. Then I calculate mortality ratios. But this can logically only be done at aggregated level. That is why I create a data frame like red211. Thereafter I base my regression models on it. But them again means are based on that second data frame and not the original values. Thus my results are distorted in size. That is why I am "desperately" looking for a solution that will get me closer to my original mean values. I hope this helps.

The model I use is a straight forward difference in difference:

lm(w.mortf ~ treatment * year, data = red) 

where: treatment is the treatment group / year the intervention year / red the aggregated data frame

===========================================================
             w.mortf                m.mortf             
-----------------------------------------------------------
(Intercept)    0.037 (0.001) ***       0.037 (0.001) ***
year           0.003 (0.001) *         0.003 (0.001) *  
tg1           -0.003 (0.001) *        -0.003 (0.001) *  
year:tg1      -0.001 (0.002)          -0.001 (0.002)    
-----------------------------------------------------------
Adj. R^2          0.000                   0.000            
Num. obs.    126031                  126031                
RMSE              0.172                   0.179            
===========================================================

The original data frame mean is approx. 0.018 - thus I think to far away from being interpretable - or where I am misled?

The figure below illustrates the issue. Where 2012Q1 should be the reference value findable based on the above regression.

three groups showing weighted means based on aggregated data


Solution

  • You have to apply the weighting after aggregation:

    red311 <- red11 %>% 
      group_by(hosptg, quarter.adm, g.mdc)  %>%  
      summarize(n= n()
                , female    = mean(sex == '1', na.rm=T) 
                , age       = mean(age, na.rm=T)
                , m.mortf   = mean(deceased == '1', na.rm=T))
    weighted.mean(red311$female, red311$n)
    #> [1] 0.52
    weighted.mean(red311$m.mortf, red311$n)
    #> [1] 0.02
    

    Edit: If the (unweighted) averages in red311 would correspond to the averages in red11, then the values in red311would be pretty meaningless. One can see this by going through the math or from a simple example:

    suppressPackageStartupMessages(library(dplyr))
    df <- data.frame(key = c('a', 'b', 'b', 'b'), value = 1:4, stringsAsFactors = FALSE)
    df
    #>   key value
    #> 1   a     1
    #> 2   b     2
    #> 3   b     3
    #> 4   b     4
    mean(df$value)
    #> [1] 2.5
    df1 <- df %>%
      group_by(key) %>%
      summarize(n = n(), value = mean(value)) %>%
      ungroup() %>%
      mutate(weighted = value * n * n() / sum(n))
    df1
    #> # A tibble: 2 x 4
    #>   key       n value weighted
    #>   <chr> <int> <dbl>    <dbl>
    #> 1 a         1  1.00    0.500
    #> 2 b         3  3.00    4.50
    mean(df1$value)
    #> [1] 2
    mean(df1$weighted)
    #> [1] 2.5
    weighted.mean(df1$value, df1$n)
    #> [1] 2.5
    

    So while it is possible to introduce the weighted column with average equal to the original average, the values in there are pretty meaningless from my point of view.

    Edit 2: The re-weighting schema used above is general and can also be applied to the original data:

    red411 <- red11 %>% 
      group_by(hosptg, quarter.adm, g.mdc)  %>%  
      summarize(n= n()
                , female    = mean(sex == '1', na.rm=T) 
                , age       = mean(age, na.rm=T)
                , m.mortf   = mean(deceased == '1', na.rm=T)) %>%
      ungroup() %>%
      mutate(w.mortf = m.mortf * n * n() / sum(n))
    mean(red411$w.mortf)
    #> [1] 0.02
    

    However, I am unsure how to interpret w.mortf.