I need to use information from a subset of my data within a dplyr::summarise
function.
My example data is grouped by unit
. Each unit has a number of parts of different type
with a number of dates.
library(dplyr)
library(lubridate)
q = data.frame(unit = c(rep(1,4), rep(2,3), rep(3,2)) ,
type = c("a", "b", "a", "a", "a", "b", "a", "a", "a"),
create = dmy(c("01/01/2001", "02/02/2002", "10/03/2003", "04/04/2004", "01/01/2001", "02/02/2002", "03/03/2003", "01/01/2001", "02/01/2001")),
fail = dmy(c("05/05/2001", "10/10/2003", "30/03/2004", NA, "01/01/2002", "01/03/2003", "01/06/2003", "01/01/2001", NA)),
last = dmy(c(rep("11/03/2008", 4), rep("01/01/2009", 3), rep("01/03/2001",2) )) )%>%
group_by(unit)%>%
mutate(last_for_unit = case_when(row_number() == n() ~T,
T~F),
atleast_6m = case_when(interval(create,last)/months(1) >=6 | !is.na(fail)~T,
T~F))
q
# A tibble: 9 x 7
# Groups: unit [3]
unit type create fail last last_for_unit atleast_6m
<dbl> <chr> <date> <date> <date> <lgl> <lgl>
1 1 a 2001-01-01 2001-05-05 2008-03-11 FALSE TRUE
2 1 b 2002-02-02 2003-10-10 2008-03-11 FALSE TRUE
3 1 a 2003-03-10 2004-03-30 2008-03-11 FALSE TRUE
4 1 a 2004-04-04 NA 2008-03-11 TRUE TRUE
5 2 a 2001-01-01 2002-01-01 2009-01-01 FALSE TRUE
6 2 b 2002-02-02 2003-03-01 2009-01-01 FALSE TRUE
7 2 a 2003-03-03 2003-06-01 2009-01-01 TRUE TRUE
8 3 a 2001-01-01 2001-01-01 2001-03-01 FALSE TRUE
9 3 a 2001-01-02 NA 2001-03-01 TRUE FALSE
I group by the type
and for each type
calculate the number that meet a rule. These are the working_at_6m
.
I now want to calculate the proportion of working_at_6m
of the total where atleast_6m
==T
.
The output should be:
type Total working_at_6m `working_at_6m%`
a 7 4 0.667 #i.e 4/6
b 2 2 1
This is what I have tried:
q_sum = q%>%
ungroup()%>%
group_by(type)%>%
summarise(Total = n(),
working_at_6m = sum(case_when(!is.na(fail) & interval(create,fail)/months(1) >= 6 ~T,
last_for_unit ==T & interval(create,last)/months(1) >= 6 ~T,
T~F)),
`working_at_6m%` = working_at_6m/Total[atleast_6m ==T])
Which produces
q_sum
# A tibble: 8 x 4
# Groups: type [2]
type Total working_at_6m `working_at_6m%`
<chr> <int> <int> <dbl>
1 a 7 4 0.571
2 a 7 4 NA
3 a 7 4 NA
4 a 7 4 NA
5 a 7 4 NA
6 a 7 4 NA
7 b 2 2 1
8 b 2 2 NA
You can try,
library(dplyr)
q%>%
ungroup()%>%
group_by(type)%>%
summarise(Total = n(),
working_at_6m = sum(case_when(!is.na(fail) & interval(create,fail)/months(1) >= 6 ~T,
last_for_unit ==T & interval(create,last)/months(1) >= 6 ~T,
T~F)),
`work_at_6m%` = working_at_6m/sum(atleast_6m))
# A tibble: 2 x 4
type Total working_at_6m `work_at_6m%`
<chr> <int> <int> <dbl>
1 a 7 4 0.667
2 b 2 2 1