df <- structure (list(
subject_id = c("232-5467", "232-6784", "232-3457", "232-0987", "232-1245", "232-1945"),
HIV_VL_result_date_1 = c("2015-10-11","2015-10-10","2015-11-06","2016-02-02","2017-12-04","2019-02-15"),
VL_results_1 = c("LDL", "LDL", "LDL", "<100", "44405", "2322"),
HIV_VL_result_date_2 = c("2017-05-21", "2022-04-07", "2016-08-21", "2016-11-01", "2018-02-26",NA),
VL_results_2 = c("LDL", "5613", "LDL", "LDL", "93356", NA),
HIV_VL_result_date_3 = c("2018-06-27", "2022-07-15", "2022-04-13", "2017-03-01","2018-05-19",NA),
VL_results_3 = c("LDL", "6590", "LDL", "LDL", "19078",NA),
HIV_VL_result_date_4 = c("2020-04-16", "2022-08-15", NA, "2022-06-07", "2020-01-16",NA),
VL_results_4 = c("LDL", "375", NA, "36", "44",NA),
HIV_VL_result_date_5 = c("2021-03-25", "2023-01-28", NA, NA, "2022-05-03",NA),
VL_results_5 = c("LDL", "9125", NA, NA, "LDL",NA),
HIV_VL_result_date_6 = c("2022-03-07", NA, NA, NA, "2022-11-15",NA),
VL_results_6 = c("LDL", NA, NA, NA, "<20",NA),
preg_date = c("2022-03-04","2022-08-13","2022-05-04","2022-06-02","2022-04-14",NA)),
class = "data.frame", row.names = c(NA, -6L))
I need to do the following:
What I've tried: Started by calculating the date diff between each HIV_VL_result_date and preg date. Resulting in 6 new columns.
date_columns <- grep("HIV_VL_result_date_", names(df), value = TRUE)
df <- df %>%
mutate(
across(
all_of(date_columns),
.fns = diff_lambda_list,
.names = "{.col}_diff_from_pregnancy_outcome"
)
)
Then I was going to select the absolute lowest value per row then somehow pull the original date corresponding to that lowest value. But I am struggling with this and surely there has to be a simpler way of doing it. I've tried searching for similar questions but can't find one with similar data structure.
We can do with rowwise
and c_across
, but I think its just better to work in a longer format.
Try this:
# library(tidyverse)
# Finding the closest results (after or before `preg_date`)
aux <- df %>%
# Wrangling and cleaning
pivot_longer(cols = contains("VL_result")) %>%
mutate(
result_id = as.integer(str_extract(name, "\\d+$")),
name = str_remove(name, "_\\d+$")) %>%
pivot_wider(names_from = name, values_from = value) %>%
mutate(across(contains("_date"), \(x) ymd(x))) %>%
# Sorting to find the closest
arrange(abs(int_length(interval(preg_date, HIV_VL_result_date)))) %>%
# Picking the closest for each subject
slice_head(by = subject_id, n = 1) %>%
transmute(
subject_id,
closest_result_id = result_id,
vl_closest_date = HIV_VL_result_date,
vl_results_closest = VL_results)
# Joining all together
df <- left_join(df, aux, by = "subject_id")
Output:
# Closest results (after or before `preg_date`)
> aux
# A tibble: 6 × 4
subject_id closest_result_id vl_closest_date vl_results_closest
<chr> <int> <date> <chr>
1 232-6784 4 2022-08-15 375
2 232-5467 6 2022-03-07 LDL
3 232-0987 4 2022-06-07 36
4 232-1245 5 2022-05-03 LDL
5 232-3457 3 2022-04-13 LDL
6 232-1945 1 2019-02-15 2322
# Not all columns - just for convenience
> select(df, 1:3, contains("closest"))
subject_id HIV_VL_result_date_1 VL_results_1 closest_result_id vl_closest_date vl_results_closest
1 232-5467 2015-10-11 LDL 6 2022-03-07 LDL
2 232-6784 2015-10-10 LDL 4 2022-08-15 375
3 232-3457 2015-11-06 LDL 3 2022-04-13 LDL
4 232-0987 2016-02-02 <100 4 2022-06-07 36
5 232-1245 2017-12-04 44405 5 2022-05-03 LDL
6 232-1945 2019-02-15 2322 1 2019-02-15 2322