Search code examples
rdataframebucketdifftime

Age groups into monthly buckets


I'm struggling to find a solution for the following problem. I have a df with id's/ dob's and another monthbucket df as following


set.seed(33)

df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10),
                 id = seq(1:10) )


monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
                          endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)

I want to get an output which gives me the count of members within age groups (<19, 19-64, >64) for each of my monthly buckets. The count obviously switches over the year when people have birthdays.

I got the age calculation with something like:

age.fct <- function(dob, bucketdate) {

  period <- as.period(interval(dob, bucketdate),unit = "year")
  period$year}

I guess the general approach would be to calculate the age for each monthbucket, assign into one of the 3 age groups and count it up by month. Any suggestions?

EDIT 1.

Thanks for all the different approaches, I just run a brief benchmark on the solutions to determine which answer to accept. Somehow the data table solution didn't work on my test data set but I will check as soon as I have a few minutes in the next days.

set.seed(33)

df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10000),
                 id = seq(1:10000) )


monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
                          endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)


birth_days <- df$dob
month_bucket <- monthbucket$startmonth

and the benchmark


microbenchmark::microbenchmark(
  MM=  monthbucket %>% group_by_all %>% expand(id=df$id) %>%  left_join(.,{df %>% mutate(birth_month =cut(dob, "month"))},by="id") %>%  mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>% 
    mutate(age_cat=case_when(age<19 ~ "<19", age>64 ~ ">64",TRUE ~ "19-64")) %>%  group_by(month) %>% count(age_cat) %>%  gather(variable, count, n) %>%
    unite(variable, age_cat) %>% spread(variable, count)
  ,
  AkselA = {ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
  ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
  ages <- sapply(ages, table)
  colnames(ages) <- monthbucket$month
  },
  Cole1 ={t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0,19,65, Inf)), rep(format(month_bucket,'%Y-%m'), length(birth_days))))
   },
  # cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
  # },
  # 
  Cole3={crossing(month_bucket, birth_days)%>%count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf)))%>%spread(age_range, n)
  },

  Cole4={all_combos <- expand.grid(month_bucket =  month_bucket, birth_days = birth_days) 
  all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
  all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
  reshape(data = aggregate( all_combos$month_bucket, by = list(bucket = all_combos$month_bucket,age_group = all_combos$cut_r), FUN = length), timevar = 'age_group' , idvar = 'bucket', direction = 'wide'  )
},
times = 1L)

Unit: milliseconds
   expr        min         lq       mean     median         uq        max neval
     MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810     1
 AkselA   17.12697   17.12697   17.12697   17.12697   17.12697   17.12697     1
  Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534     1
  Cole3   23.63945   23.63945   23.63945   23.63945   23.63945   23.63945     1
  Cole4  877.92782  877.92782  877.92782  877.92782  877.92782  877.92782     1

Based on speed AkselA's approach seems to be the fastest but I get a different result for M-M's approach compared to all others (once AkselA's changes to 65 in the cut part cut, c(0, 19, 64, Inf)... I will accept answer based on speed but will look into the differences in the results!


Solution

  • Assuming I understand your request.

    ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
    
    ages <- do.call(data.frame, 
      lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
    
    ages <- sapply(ages, table)
    colnames(ages) <- monthbucket$month
    ages
    #       2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
    # 0-19        2       2       2       2       2       2       2       2       2       2       2       2       2
    # 19-64       7       7       7       7       7       7       7       7       7       7       7       7       7
    # 64+         1       1       1       1       1       1       1       1       1       1       1       1       1
    #