Given this dataframe:
library(tidyverse)
library(lubridate)
df <- tribble(
~person, ~start, ~end,
'1', '2000-01-01', '2000-12-31',
'1', '2001-01-01', '2002-07-31',
'1', '2001-04-01', '2001-06-30',
'1', '2001-08-01', '2001-12-31',
'1', '2002-01-01', '2002-12-31',
'2', '2000-01-01', '2000-11-30',
'2', '2001-01-01', '2001-12-31') |>
mutate(start = lubridate::as_date(start),
end = lubridate::as_date(end))
I am trying to produce output to show:
person start end
1 2000-01-01 2002-12-31
2 2000-01-01 2001-11-30
2 2001-01-01 2001-12-31
These data have overlapping dates so I am trying to create a group of the start and end date that are consecutive without a lapse.
I tried this
df |>
distinct() |>
arrange(person, start, end) |>
mutate(
gap = start - lag(end, default = min(start))
) |>
group_by(person) |>
summarise(
MIN_START = min(start),
MAX_END = max(end)
)
but this takes the first min and last max dates, and ignores any breaks in-between. Any suggestions on how to accomplish this?
A more frugal function, this one uses a fraction of the memory that the first function did (below, kept for comparison).
fun <- function(starts, ends) {
S <- starts; E <- ends
for (i in rev(seq_along(S))) {
ind <- ind <- which(S[-i] <= S[i] & S[i] <= E[-i] + 1)
if (length(ind)) {
ind <- max(ind) + (max(ind) > i)
S[ind] <- min(S[c(ind, i)])
E[ind] <- max(E[c(ind, i)])
S <- S[-i]
E <- E[-i]
}
}
data.frame(start=S, end=E)
}
reframe(df, .by=person, fun(start, end))
# # A tibble: 3 × 3
# person start end
# <chr> <date> <date>
# 1 1 2000-01-01 2002-12-31
# 2 2 2000-01-01 2000-11-30
# 3 2 2001-01-01 2001-12-31
(Old answer, not needed.)
Using a helper function (which is admittedly brute-force):
fun <- function(starts, ends) {
x <- unique(sort(do.call(c, Map(seq, starts, ends, list(by="day")))))
x <- split(x, cumsum(c(TRUE, diff(x) > 1))) |>
lapply(function(z) setNames(data.frame(as.list(range(z))),
c("start", "end")))
do.call(rbind, x)
}
We then can do
dplyr::reframe(df, .by=person, fun(start, end))
# # A tibble: 3 × 3
# person start end
# <chr> <date> <date>
# 1 1 2000-01-01 2002-12-31
# 2 2 2000-01-01 2000-11-30
# 3 2 2001-01-01 2001-12-31