Given a tibble of historic changes I want to efficiently create a tibble with what value each id had at a given date.
Example
library(lubridate)
library(tidyverse)
df <- tribble(
~id, ~value, ~date_created,
1, "a", as_date("2020-01-01"),
1, "b", as_date("2020-01-06"),
1, "c", as_date("2020-02-01"),
2, "Y", as_date("2020-01-01"),
2, "Z", as_date("2020-01-02")
)
# function should output a tibble with one row per id with the value it had at that date
get_value_at_date <- function(df, date){}
get_value_at_date(df, as_date("2019-01-01"))
should have output tribble(~id,~value,1,NA,2,NA)
get_value_at_date(df, as_date("2020-01-06"))
should have output tribble(~id,~value,1,"b",2,"Z")
get_value_at_date(df, as_date("2020-03-01"))
should have output tribble(~id,~value,1,"c",2,"Z")
Example Solution Function
get_value_at_date <- function(df, date){
# find the last change before the date
value_at_date_df <- df %>%
arrange(id, date_created) %>%
group_by(id) %>%
filter(date_created <= date) %>%
slice_tail(n = 1) %>%
select(id,value)
# value could be of many class types, and need a unique NA for each
value_class <- class(df %>% select(value) %>% pull())
# we're assuming as.CLASS(NA) works for all CLASS inputs
bespoke_na <- eval(parse(text=paste0("as.",value_class,"(NA)")))
# find any that have been removed so should be blank
missed_ids <- df %>%
anti_join(value_at_date_df, by = "id") %>%
pull(id) %>%
unique()
# make it a df
missed_ids_df <- tibble(
id = missed_ids,
value = bespoke_na
)
# attach the 2 dfs
out_df <- bind_rows(value_at_date_df,missed_ids_df) %>%
arrange(id) %>%
ungroup()
return(out_df)
}
I have the following two issues with my solution:
eval
to guess the class of NA does not feel like good practice. The reason for this is the function's input tibble could have the value column as any class. I do not know if for every class --class-- the function as.--class-- exists.The .preserve
argument to filter
removes the need to handle removed groups.
last
uses dplyr:::default_missing
for sensible missing values, but that can be overriden if needed.
get_value_at_date_2 <- function(df, date){
df %>%
group_by(id) %>%
dplyr::filter(date_created <= date, .preserve = TRUE) %>%
summarize(value = dplyr::last(value, order_by = date_created))
}
get_value_at_date_2(df, as_date("2019-01-01"))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> id value
#> <dbl> <chr>
#> 1 1 <NA>
#> 2 2 <NA>
get_value_at_date_2(df, as_date("2020-01-06"))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> id value
#> <dbl> <chr>
#> 1 1 b
#> 2 2 Z
get_value_at_date_2(df, as_date("2020-03-01"))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 2
#> id value
#> <dbl> <chr>
#> 1 1 c
#> 2 2 Z
(Benchmarking is left out, as it is better conducted at mid to actual scale on real data, and would probably be meaningless on example data above. If performance remains an issue, consider the data.table
package, that can mix with the tidyverse
).
I ended up running a benchmark, and the performance is not really better
create_df <- function(n, rows_to_groups_ratio, seed = 123) {
set.seed(seed)
tibble(
id = sample(sample(n %/% rows_to_groups_ratio, 1), n, replace = TRUE),
value = sample(c(letters, LETTERS), n, replace = TRUE),
date_declared = sample(seq(as.Date("2019-01-01"), as.Date("2020-07-01"), "day"),
n, replace = TRUE)
)
}
mybench <- bench::press(
n = c(1e3, 10e3, 100e3, 1e6),
rows_to_groups_ratio = c(3, 5, 10, 50, 100),
{
df <- create_df(n, rows_to_groups_ratio)
date <- as.Date("2020-01-01")
bench::mark(
get_value_at_date(df, date),
get_value_at_date_2(df, date)
)
}
)
autoplot(mybench)