I have a data frame of records with a date and "group". I would like to find every combination of records which satisfies the constraints of:
My data is small enough that I should be able to find every combination. If I can apply the constraints while creating the combinations it should keep the size manageable. But so far I've only been able to create all combinations and then filter, which is very slow.
I'm guessing data.table
would be useful? Below is a MRE:
library(tidyverse)
set.seed(0)
#Set some parameters to be used later
n_rows <- 1000
n_elig_groups <- 4
max_date_diff <- 8
#Create a data frame with id, date & group
df <- tibble(
id = 1:n_rows,
date = sample(seq.Date(ymd(20230101), ymd(20231231), by = "day"),
n_rows,
replace = TRUE),
group = rep(letters, length.out = n_rows)
)
#What are the eligible groups?
elig_groups <- letters[sample(1:26, n_elig_groups)]
#Find every combination of records within the eligible groups where:
#There is one and only one record from each group.
#Each record within the combination has a distinct date.
#All records within a combination are within x days of each other.
#Bad approach: this is technically correct but super inefficient
#Create every possible combination of records within the eligible groups
combos <- df %>%
filter(group == elig_groups[1]) %>%
select(-group) %>%
rename_all(function(x){paste0(x, "_", elig_groups[1])})
for(group_i in elig_groups[-1]){
combos <- combos %>%
expand_grid({
df %>%
filter(group == group_i) %>%
select(-group) %>%
rename_all(function(x){paste0(x, "_", group_i)})
})
}
#Now find the combinations which meet our constraints
valid_combos <- combos %>%
#First, pivot to get a row for every record
mutate(combo_id = row_number()) %>%
pivot_longer(-combo_id,
names_to = c(".value", "group"),
names_sep = "_") %>%
#Apply our constraints by combo
group_by(combo_id) %>%
filter(n_distinct(date) == n_elig_groups,
max(date) - min(date) <= max_date_diff) %>%
ungroup()
Form valid combination affiliations with a data.table
non-equi join on the dates followed by filtering on id
and group
, then use igraph::cliques
to find the all combinations of size 4:
library(data.table)
library(igraph)
dt <- setDT(df)[group %in% elig_groups]
combos <- matrix(
as(
names(
unlist(
cliques(
graph_from_data_frame(
dt[,datePlus := date + max_date_diff][
dt,
on = .(date > date, date <= datePlus),
.(id, i.id, group, i.group)
][
id != i.id & group != i.group
][,3:4 := NULL],
FALSE
),
n_elig_groups,
n_elig_groups
)
)
), class(df$id)
),
ncol = n_elig_groups, byrow = TRUE
)
There are 61 combinations:
dim(combos)
#> [1] 61 4
The first combination:
dt[id %in% combos[1,], 1:3]
#> id date group
#> 1: 39 2023-02-08 m
#> 2: 186 2023-02-10 d
#> 3: 323 2023-02-06 k
#> 4: 510 2023-02-13 p