Search code examples
rdatejoin

R : Join two dataframe on multiple conditions


I've got two dataframes. data.frame1 has information about patient ID, Visit Date, Visit number and treatment. data.frame2 has information about patient ID, Visit Date and a biological variable "Gly". I would like to merge those two df in order to know which treatment the patient had when he had his biological test.

data.frame1 <- data.frame(
  ID = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3),
  Date1 = c(1, 5, 8, 12, 4, 9, 18, 22, 24, 33, 12, 22, 34, 56),
  Num_Visit = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4),
  Treatment = c("A", "A", "B", "C", "B", "B", "E", "E", "A", "A", "D", "D", "C", "E")
)

data.frame2 <- data.frame(
  ID = c(1, 1, 2, 2, 2, 3, 3, 3),
  Date2 = c(1, 8, 20, 24, 34, 12, 41, 56),
  Num_Visit = c(1, 2, 1, 2, 3, 1, 2, 3),
  Gly = c(1.1, 1.3, 1.1, 0.9, 1.4, 1.0, 1.8, 0.8)
)

(And no the Num_Visit don't match)

I've been looking around and tried this:

result <- data.frame1 %>%
  left_join(data.frame2, by = "ID") %>%
  group_by(ID, Date1) %>%
  filter(between(Date2, lag(Date1), lead(Date1, default = last(Date1)))) %>%
  summarise(Gly = first(Gly))

but that doesn't seem to work. I would like to have a data frame with a Date2, the matching biological variable "Gly" for this date and also a matching treatment for this date. It would look :

data.frame3 <- data.frame(
  ID = c(1, 1, 2, 2, 2, 3, 3, 3),
  Date2 = c(1, 8, 20, 24, 34, 25, 41, 57),
  Num_Visit = c(1, 2, 1, 2, 3, 1, 2, 3),
  Gly = c(1.1, 1.3, 1.1, 0.9, 1.4, 1.0, 1.8, 0.8),
  Treatment = c("A", "B", "E", "A", "A", "D", "C", "E")
)

Solution

  • A data.table solution using rolling joins.

    library(data.table)
    df1 <- data.table(
      ID = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3),
      Date = c(1, 5, 8, 12, 4, 9, 18, 22, 24, 33, 12, 22, 34, 56),
      Num_Visit = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4),
      Treatment = c('A', 'A', 'B', 'C', 'B', 'B', 'E', 
                    'E', 'A', 'A', 'D', 'D', 'C', 'E')
    )
    
    df2 <- data.table(
      ID = c(1, 1, 2, 2, 2, 3, 3, 3),
      Date = c(1, 8, 20, 24, 34, 12, 41, 56),
      Num_Visit = c(1, 2, 1, 2, 3, 1, 2, 3),
      Gly = c(1.1, 1.3, 1.1, 0.9, 1.4, 1.0, 1.8, 0.8)
    )
    
    setkey(df1, ID, Date)
    setkey(df2, ID, Date)
    result <- df1[df2, .(ID, Date2 = Date, Num_Visit = i.Num_Visit, Gly, Treatment), 
                  roll = TRUE, rollends = TRUE]
    
       ID Date2 Num_Visit Gly Treatment
    1:  1     1         1 1.1         A
    2:  1     8         2 1.3         B
    3:  2    20         1 1.1         E
    4:  2    24         2 0.9         A
    5:  2    34         3 1.4         A
    6:  3    12         1 1.0         D
    7:  3    41         2 1.8         C
    8:  3    56         3 0.8         E