Select closest date and corresponding result from multiple columns based on another date in R

df <- structure (list(
  subject_id = c("232-5467", "232-6784", "232-3457", "232-0987", "232-1245", "232-1945"), 
  HIV_VL_result_date_1 = c("2015-10-11","2015-10-10","2015-11-06","2016-02-02","2017-12-04","2019-02-15"),
  VL_results_1 = c("LDL", "LDL", "LDL", "<100", "44405", "2322"), 
  HIV_VL_result_date_2 = c("2017-05-21", "2022-04-07", "2016-08-21", "2016-11-01", "2018-02-26",NA),
  VL_results_2 = c("LDL", "5613", "LDL", "LDL", "93356", NA), 
  HIV_VL_result_date_3 = c("2018-06-27", "2022-07-15", "2022-04-13", "2017-03-01","2018-05-19",NA), 
  VL_results_3 = c("LDL", "6590", "LDL", "LDL", "19078",NA), 
  HIV_VL_result_date_4 = c("2020-04-16", "2022-08-15", NA, "2022-06-07", "2020-01-16",NA),
  VL_results_4 = c("LDL", "375", NA, "36", "44",NA),
  HIV_VL_result_date_5 = c("2021-03-25", "2023-01-28", NA, NA, "2022-05-03",NA),
  VL_results_5 = c("LDL", "9125", NA, NA, "LDL",NA),
  HIV_VL_result_date_6 = c("2022-03-07", NA, NA, NA, "2022-11-15",NA),
  VL_results_6 = c("LDL", NA, NA, NA, "<20",NA),
  preg_date = c("2022-03-04","2022-08-13","2022-05-04","2022-06-02","2022-04-14",NA)), 
  class = "data.frame", row.names = c(NA, -6L))

I need to do the following:

Out of all 6 HIV_VL_result_date, select the HIV_VL_result_date closest to preg_date, as well as the corresponding VL_results.
Rename the selected date and result to vl_closest_date and vl_results_closest respectively

What I've tried: Started by calculating the date diff between each HIV_VL_result_date and preg date. Resulting in 6 new columns.

date_columns <- grep("HIV_VL_result_date_", names(df), value = TRUE)

df <- df %>%
  mutate(
    across(
      all_of(date_columns),
      .fns = diff_lambda_list,
      .names = "{.col}_diff_from_pregnancy_outcome"
    )
  )

Then I was going to select the absolute lowest value per row then somehow pull the original date corresponding to that lowest value. But I am struggling with this and surely there has to be a simpler way of doing it. I've tried searching for similar questions but can't find one with similar data structure.

Solution

We can do with rowwise and c_across, but I think its just better to work in a longer format.

Try this:

# library(tidyverse)

# Finding the closest results (after or before `preg_date`)
aux <- df %>% 
  # Wrangling and cleaning
  pivot_longer(cols = contains("VL_result")) %>%
  mutate(
    result_id = as.integer(str_extract(name, "\\d+$")),
    name = str_remove(name, "_\\d+$")) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  mutate(across(contains("_date"), \(x) ymd(x))) %>%
  
  # Sorting to find the closest
  arrange(abs(int_length(interval(preg_date, HIV_VL_result_date)))) %>% 
  
  # Picking the closest for each subject
  slice_head(by = subject_id, n = 1) %>% 
  transmute(
    subject_id,
    closest_result_id = result_id,
    vl_closest_date = HIV_VL_result_date, 
    vl_results_closest = VL_results)

# Joining all together
df <- left_join(df, aux, by = "subject_id")

Output:

# Closest results (after or before `preg_date`)
> aux
# A tibble: 6 × 4
  subject_id closest_result_id vl_closest_date vl_results_closest
  <chr>                  <int> <date>          <chr>             
1 232-6784                   4 2022-08-15      375               
2 232-5467                   6 2022-03-07      LDL               
3 232-0987                   4 2022-06-07      36                
4 232-1245                   5 2022-05-03      LDL               
5 232-3457                   3 2022-04-13      LDL               
6 232-1945                   1 2019-02-15      2322 

# Not all columns - just for convenience
> select(df, 1:3, contains("closest"))
  subject_id HIV_VL_result_date_1 VL_results_1 closest_result_id vl_closest_date vl_results_closest
1   232-5467           2015-10-11          LDL                 6      2022-03-07                LDL
2   232-6784           2015-10-10          LDL                 4      2022-08-15                375
3   232-3457           2015-11-06          LDL                 3      2022-04-13                LDL
4   232-0987           2016-02-02         <100                 4      2022-06-07                 36
5   232-1245           2017-12-04        44405                 5      2022-05-03                LDL
6   232-1945           2019-02-15         2322                 1      2019-02-15               2322