Search code examples
rdplyr

Select closest date and corresponding result from multiple columns based on another date in R


df <- structure (list(
  subject_id = c("232-5467", "232-6784", "232-3457", "232-0987", "232-1245", "232-1945"), 
  HIV_VL_result_date_1 = c("2015-10-11","2015-10-10","2015-11-06","2016-02-02","2017-12-04","2019-02-15"),
  VL_results_1 = c("LDL", "LDL", "LDL", "<100", "44405", "2322"), 
  HIV_VL_result_date_2 = c("2017-05-21", "2022-04-07", "2016-08-21", "2016-11-01", "2018-02-26",NA),
  VL_results_2 = c("LDL", "5613", "LDL", "LDL", "93356", NA), 
  HIV_VL_result_date_3 = c("2018-06-27", "2022-07-15", "2022-04-13", "2017-03-01","2018-05-19",NA), 
  VL_results_3 = c("LDL", "6590", "LDL", "LDL", "19078",NA), 
  HIV_VL_result_date_4 = c("2020-04-16", "2022-08-15", NA, "2022-06-07", "2020-01-16",NA),
  VL_results_4 = c("LDL", "375", NA, "36", "44",NA),
  HIV_VL_result_date_5 = c("2021-03-25", "2023-01-28", NA, NA, "2022-05-03",NA),
  VL_results_5 = c("LDL", "9125", NA, NA, "LDL",NA),
  HIV_VL_result_date_6 = c("2022-03-07", NA, NA, NA, "2022-11-15",NA),
  VL_results_6 = c("LDL", NA, NA, NA, "<20",NA),
  preg_date = c("2022-03-04","2022-08-13","2022-05-04","2022-06-02","2022-04-14",NA)), 
  class = "data.frame", row.names = c(NA, -6L))

I need to do the following:

  1. Out of all 6 HIV_VL_result_date, select the HIV_VL_result_date closest to preg_date, as well as the corresponding VL_results.
  2. Rename the selected date and result to vl_closest_date and vl_results_closest respectively

What I've tried: Started by calculating the date diff between each HIV_VL_result_date and preg date. Resulting in 6 new columns.

date_columns <- grep("HIV_VL_result_date_", names(df), value = TRUE)

df <- df %>%
  mutate(
    across(
      all_of(date_columns),
      .fns = diff_lambda_list,
      .names = "{.col}_diff_from_pregnancy_outcome"
    )
  )

Then I was going to select the absolute lowest value per row then somehow pull the original date corresponding to that lowest value. But I am struggling with this and surely there has to be a simpler way of doing it. I've tried searching for similar questions but can't find one with similar data structure.


Solution

  • We can do with rowwise and c_across, but I think its just better to work in a longer format.

    Try this:

    # library(tidyverse)
    
    # Finding the closest results (after or before `preg_date`)
    aux <- df %>% 
      # Wrangling and cleaning
      pivot_longer(cols = contains("VL_result")) %>%
      mutate(
        result_id = as.integer(str_extract(name, "\\d+$")),
        name = str_remove(name, "_\\d+$")) %>%
      pivot_wider(names_from = name, values_from = value) %>%
      mutate(across(contains("_date"), \(x) ymd(x))) %>%
      
      # Sorting to find the closest
      arrange(abs(int_length(interval(preg_date, HIV_VL_result_date)))) %>% 
      
      # Picking the closest for each subject
      slice_head(by = subject_id, n = 1) %>% 
      transmute(
        subject_id,
        closest_result_id = result_id,
        vl_closest_date = HIV_VL_result_date, 
        vl_results_closest = VL_results)
    
    # Joining all together
    df <- left_join(df, aux, by = "subject_id")
    

    Output:

    # Closest results (after or before `preg_date`)
    > aux
    # A tibble: 6 × 4
      subject_id closest_result_id vl_closest_date vl_results_closest
      <chr>                  <int> <date>          <chr>             
    1 232-6784                   4 2022-08-15      375               
    2 232-5467                   6 2022-03-07      LDL               
    3 232-0987                   4 2022-06-07      36                
    4 232-1245                   5 2022-05-03      LDL               
    5 232-3457                   3 2022-04-13      LDL               
    6 232-1945                   1 2019-02-15      2322 
    
    # Not all columns - just for convenience
    > select(df, 1:3, contains("closest"))
      subject_id HIV_VL_result_date_1 VL_results_1 closest_result_id vl_closest_date vl_results_closest
    1   232-5467           2015-10-11          LDL                 6      2022-03-07                LDL
    2   232-6784           2015-10-10          LDL                 4      2022-08-15                375
    3   232-3457           2015-11-06          LDL                 3      2022-04-13                LDL
    4   232-0987           2016-02-02         <100                 4      2022-06-07                 36
    5   232-1245           2017-12-04        44405                 5      2022-05-03                LDL
    6   232-1945           2019-02-15         2322                 1      2019-02-15               2322