Search code examples
rdplyrlubridate

Calculating unique masons by grouping consequent last months in a dataset


I have a dataset with columns:

set.seed(123)
df <- data.frame(Mason_Id = sample(c("Mason1", "Mason2","Mason3","Mason4","Mason5","Mason6"), 12, T),
                 Registration_Date = c("01-08-2020", "01-08-2020","05-08-2020","07-08-2020",
                          "02-09-2020", "02-09-2020","02-09-2020",
                          "03-09-2020","04-09-2020","01-10-2020","02-10-2020",
                          "06-10-2020"),
                 Token_Count = runif(12, 10, 100), stringsAsFactors = F)

#calculate last day of every month
library(lubridate)
df$month_end_date=paste(format(df$Registration_Date, format="%m-%Y"),"-", days_in_month(df$Registration_Date), sep="")

I need to find the unique number of masons in the last 3 months starting October and moving backwards, in the following format:

Registration_Date | Unique_Masons
31-10-2020   |   5(unique masons in Oct,Sep, Aug) 
30-09-2020   |   x1(unique masons in Sep, Aug, July)
31-08-2020   |   x2(unique masons in Aug, July, June)
... and so on.

I have tried summarizing the data by quarter and monthly basis but it has not worked for me. Kindly help. Thanks in advance.


Solution

  • Base R solution:

    clean_df <- transform(
      df,
      Month_Vec = as.Date(gsub("^\\d{2}", "01", Registration_Date), "%d-%m-%Y"),
      Registration_Date = as.Date(Registration_Date, "%d-%m-%Y")
    )
    
    drng <- range(clean_df$Month_Vec)+31
    
    eom_df <- merge(clean_df, 
                data.frame(eom = seq(drng[1], drng[2], by = "1 month")-1, 
                Month_Vec = sort(unique(clean_df$Month_Vec))), by = "Month_Vec", all.x = TRUE)
    
    lapply(unique(eom_df$Month_Vec), 
      function(x){
        lower_lim <- seq(x, length = 2, by = "-3 months")[2]
        sbst <- subset(eom_df, Month_Vec >= lower_lim & Month_Vec <= x)
        data.frame(
          Registration_Date = max(sbst$eom),
          Unique_Masons = paste0(length(unique(sbst$Mason_Id)), "(Unique Masons in ",
            paste0(unique(month.abb[as.integer(substr(sbst$Month_Vec, 6, 7))]), 
            collapse = ", "), ")"
          )
        )
      }
    )