I'm trying to add the list of month and year between two dates.
I have the following dataset:
PKID Name Gender DateStart DateEnd
68 PAUL 1 24/11/2021 23/02/2022
68 PAUL 1 24/04/2022 23/06/2023
40 KATE 2 01/01/2000 14/03/2000
40 KATE 2 03/12/2000 31/01/2001
And I Want to create the following dataset:
PKID Name Gender DateStart DateEnd year Month
68 PAUL 1 24/11/2021 23/02/2022 2021 11
68 PAUL 1 24/11/2021 23/02/2022 2021 12
68 PAUL 1 24/11/2021 23/02/2022 2022 1
68 PAUL 1 24/11/2021 23/02/2022 2022 2
68 PAUL 1 24/04/2022 23/06/2023 2022 4
68 PAUL 1 24/04/2022 23/06/2023 2022 5
68 PAUL 1 24/04/2022 23/06/2023 2022 6
40 KATE 2 01/01/2000 14/03/2000 2000 1
40 KATE 2 01/01/2000 14/03/2000 2000 2
40 KATE 2 01/01/2000 14/03/2000 2000 3
40 KATE 2 03/12/2000 31/01/2001 2000 12
40 KATE 2 03/12/2000 31/01/2001 2001 1
Where month corresponds to the month between the star date and the end date and year correspond to the month.
I have tried the following:
# Load necessary libraries
library(dplyr)
library(tidyr)
library(lubridate) # For handling date operations
# Sample data
df <- read.table(text = "
PKID Name Gender DateStart DateEnd
68 PAUL 1 24/11/2021 23/02/2022
68 PAUL 1 24/04/2022 23/06/2023
40 KATE 2 01/01/2000 14/03/2000
40 KATE 2 03/12/2000 31/01/2001
", header = TRUE, stringsAsFactors = FALSE)
# Convert date columns to Date format
df$DateStart <- dmy(df$DateStart)
df$DateEnd <- dmy(df$DateEnd)
# Generate sequence of dates for each row
df <- df %>%
group_by(PKID, Name, Gender, DateStart, DateEnd) %>%
complete(Date = seq.Date(DateStart, DateEnd, by = "month")) %>%
ungroup() %>%
mutate(
year = year(Date), # Extract year
Month = month(Date) # Extract month
) %>%
select(-Date) # Remove the temporary Date column
# Print the result
print(df)
But I get the following error:
Error in reframe():
ℹ In argument: complete(data = pick(everything()), ..., fill = fill, explicit = explicit).
ℹ In group 1: PKID = 40, Name = "KATE", Gender = 2, DateStart = 2000-01-01, DateEnd = 2000-03-14.
Caused by error:
! object 'DateStart' not found
You can try:
library(dplyr)
library(purrr)
library(tidyr)
library(lubridate)
df |>
mutate(date = map2(DateStart, rollforward(DateEnd), \(x, y) seq(x, y, by = "month"))) |>
unnest(date) |>
mutate(year = year(date),
month = month(date),
date = NULL)
# A tibble: 26 × 7
PKID Name Gender DateStart DateEnd year month
<int> <chr> <int> <date> <date> <dbl> <dbl>
1 68 PAUL 1 2021-11-24 2022-02-23 2021 11
2 68 PAUL 1 2021-11-24 2022-02-23 2021 12
3 68 PAUL 1 2021-11-24 2022-02-23 2022 1
4 68 PAUL 1 2021-11-24 2022-02-23 2022 2
5 68 PAUL 1 2022-04-24 2023-06-23 2022 4
6 68 PAUL 1 2022-04-24 2023-06-23 2022 5
7 68 PAUL 1 2022-04-24 2023-06-23 2022 6
8 68 PAUL 1 2022-04-24 2023-06-23 2022 7
9 68 PAUL 1 2022-04-24 2023-06-23 2022 8
10 68 PAUL 1 2022-04-24 2023-06-23 2022 9
# ℹ 16 more rows
# ℹ Use `print(n = ...)` to see more rows
Or alternatively with reframe()
df |>
reframe(date = seq(DateStart, rollforward(DateEnd), by = "month"),
year = year(date),
month = month(date),
.by = everything()) |>
select(-date)