I have a dataset with columns:
set.seed(123)
df <- data.frame(Mason_Id = sample(c("Mason1", "Mason2","Mason3","Mason4","Mason5","Mason6"), 12, T),
Registration_Date = c("01-08-2020", "01-08-2020","05-08-2020","07-08-2020",
"02-09-2020", "02-09-2020","02-09-2020",
"03-09-2020","04-09-2020","01-10-2020","02-10-2020",
"06-10-2020"),
Token_Count = runif(12, 10, 100), stringsAsFactors = F)
#calculate last day of every month
library(lubridate)
df$month_end_date=paste(format(df$Registration_Date, format="%m-%Y"),"-", days_in_month(df$Registration_Date), sep="")
I need to find the unique number of masons in the last 3 months starting October and moving backwards, in the following format:
Registration_Date | Unique_Masons
31-10-2020 | 5(unique masons in Oct,Sep, Aug)
30-09-2020 | x1(unique masons in Sep, Aug, July)
31-08-2020 | x2(unique masons in Aug, July, June)
... and so on.
I have tried summarizing the data by quarter and monthly basis but it has not worked for me. Kindly help. Thanks in advance.
Base R solution:
clean_df <- transform(
df,
Month_Vec = as.Date(gsub("^\\d{2}", "01", Registration_Date), "%d-%m-%Y"),
Registration_Date = as.Date(Registration_Date, "%d-%m-%Y")
)
drng <- range(clean_df$Month_Vec)+31
eom_df <- merge(clean_df,
data.frame(eom = seq(drng[1], drng[2], by = "1 month")-1,
Month_Vec = sort(unique(clean_df$Month_Vec))), by = "Month_Vec", all.x = TRUE)
lapply(unique(eom_df$Month_Vec),
function(x){
lower_lim <- seq(x, length = 2, by = "-3 months")[2]
sbst <- subset(eom_df, Month_Vec >= lower_lim & Month_Vec <= x)
data.frame(
Registration_Date = max(sbst$eom),
Unique_Masons = paste0(length(unique(sbst$Mason_Id)), "(Unique Masons in ",
paste0(unique(month.abb[as.integer(substr(sbst$Month_Vec, 6, 7))]),
collapse = ", "), ")"
)
)
}
)