Search code examples
rdplyrsummarize

Filter within dplyr::summarise to make calculation


I need to use information from a subset of my data within a dplyr::summarise function.

My example data is grouped by unit. Each unit has a number of parts of different type with a number of dates.

library(dplyr)
library(lubridate)

q = data.frame(unit = c(rep(1,4), rep(2,3), rep(3,2)) ,
              type = c("a", "b", "a", "a", "a", "b", "a", "a", "a"),
               create = dmy(c("01/01/2001", "02/02/2002", "10/03/2003", "04/04/2004", "01/01/2001", "02/02/2002", "03/03/2003", "01/01/2001", "02/01/2001")),
               fail = dmy(c("05/05/2001", "10/10/2003", "30/03/2004", NA, "01/01/2002", "01/03/2003", "01/06/2003", "01/01/2001", NA)),
               last = dmy(c(rep("11/03/2008", 4), rep("01/01/2009", 3), rep("01/03/2001",2)  )) )%>%
  group_by(unit)%>%
  mutate(last_for_unit = case_when(row_number() == n() ~T,
                                    T~F),
         atleast_6m = case_when(interval(create,last)/months(1) >=6 | !is.na(fail)~T,
                                T~F))

 q
# A tibble: 9 x 7
# Groups:   unit [3]
   unit type  create     fail       last       last_for_unit atleast_6m
  <dbl> <chr> <date>     <date>     <date>     <lgl>         <lgl>     
1     1 a     2001-01-01 2001-05-05 2008-03-11 FALSE         TRUE      
2     1 b     2002-02-02 2003-10-10 2008-03-11 FALSE         TRUE      
3     1 a     2003-03-10 2004-03-30 2008-03-11 FALSE         TRUE      
4     1 a     2004-04-04 NA         2008-03-11 TRUE          TRUE      
5     2 a     2001-01-01 2002-01-01 2009-01-01 FALSE         TRUE      
6     2 b     2002-02-02 2003-03-01 2009-01-01 FALSE         TRUE      
7     2 a     2003-03-03 2003-06-01 2009-01-01 TRUE          TRUE      
8     3 a     2001-01-01 2001-01-01 2001-03-01 FALSE         TRUE      
9     3 a     2001-01-02 NA         2001-03-01 TRUE          FALSE                    

I group by the type and for each type calculate the number that meet a rule. These are the working_at_6m.

I now want to calculate the proportion of working_at_6m of the total where atleast_6m ==T.

The output should be:

type  Total working_at_6m `working_at_6m%`
a         7             4            0.667   #i.e 4/6
b         2             2            1    

This is what I have tried:

q_sum = q%>%
  ungroup()%>%
  group_by(type)%>%
  summarise(Total = n(),
            working_at_6m = sum(case_when(!is.na(fail) & interval(create,fail)/months(1) >= 6 ~T,
                                          last_for_unit ==T & interval(create,last)/months(1) >= 6 ~T,
                                          T~F)),
            `working_at_6m%` = working_at_6m/Total[atleast_6m ==T])

Which produces

q_sum
# A tibble: 8 x 4
# Groups:   type [2]
  type  Total working_at_6m `working_at_6m%`
  <chr> <int>         <int>            <dbl>
1 a         7             4            0.571
2 a         7             4           NA    
3 a         7             4           NA    
4 a         7             4           NA    
5 a         7             4           NA    
6 a         7             4           NA    
7 b         2             2            1    
8 b         2             2           NA 

Solution

  • You can try,

    library(dplyr)
    
    q%>%
      ungroup()%>%
      group_by(type)%>%
      summarise(Total = n(),
                working_at_6m = sum(case_when(!is.na(fail) & interval(create,fail)/months(1) >= 6 ~T,
                                              last_for_unit ==T & interval(create,last)/months(1) >= 6 ~T,
                                              T~F)), 
                `work_at_6m%` = working_at_6m/sum(atleast_6m))
    
    # A tibble: 2 x 4
      type  Total working_at_6m `work_at_6m%`
      <chr> <int>         <int>         <dbl>
    1 a         7             4         0.667
    2 b         2             2         1