Search code examples
rleft-joinintervals

Joining two dataframes only when both start times and end times of one data frame fall into intervals of the other dataframe


I want to do an exact join of two temporally aligned dataframes using interval_left_join:

df1 <- structure(list(Utterance = c("(5.127)", ">like how old 's your mom¿ ", 
                                "(0.855)", "eh six:ty:::-one= ", "(0.101)", "(0.166)", "=NOW ", 
                                "(0.622)", "!how! this always plays out  ", "(0.726)", "[when was] that¿= ", 
                                "[yes] ", "(0.163)", "=!this! was on °Wednesday° ", "(0.273)", 
                                "oka[y] "), Starttime_ms = c(0, 5127, 6830, 7685, 9889, 176800, 
                                                             176966, 177372, 177994, 179328, 180054, 180135, 180668, 180831, 
                                                             181720, 181993), Endtime_ms = c(5127, 6830, 7685, 9889, 9990, 
                                                                                             176966, 177372, 177994, 179328, 180054, 180668, 180555, 180831, 
                                                                                             181720, 181993, 182398)), row.names = c(NA, -16L), class = c("tbl_df", 
                                                                                                                                                          "tbl", "data.frame"))

df2 <- structure(list(Q_segment = c(">like I don 't understand< ", "sorry like how old 's your mom¿", 
                                    "[when was] that¿=", "=[did she let you in?]", "=holy [sh:]i:[:t", 
                                    "does it happen] often? ", "[like] ", "[what] type of:: tours is it ", 
                                    "is it [(like a long] ti:me¿", "[or]"), Starttime_ms = c(5127, 
                                                                                             5963, 180054, 253226, 345450, 347454, 348459, 478500, 480669, 
                                                                                             481763), Endtime_ms = c(5963, 6830, 180668, 254156, 347454, 348459, 
                                                                                                                     348724, 480669, 481763, 482000)), row.names = c(NA, -10L), class = c("tbl_df", 
                                                                                                                                                                                          "tbl", "data.frame"))

Joining these two ata frames produces more joins than is desired presumably because interval_left_join joins when either one of Starttime_ms and Endtime_ms of df2 is identical with the first millisecond of Endtime_ms or Starttime_ms of df2:

# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("IRanges")
library(BiocManager)
library(fuzzyjoin)
library(data.table)

interval_left_join(x = df1,
                   y = df2,
                   by = c("Starttime_ms", "Endtime_ms"))
# A tibble: 17 × 6
   Utterance                       Starttime_ms.x Endtime_ms.x Q_segment            Starttime_ms.y Endtime_ms.y
   <chr>                                    <dbl>        <dbl> <chr>                         <dbl>        <dbl>
 1 "(5.127)"                                    0         5127 >like                          5127         5963
 2 ">like how old 's your mom¿ "             5127         6830 >like                          5127         5963
 3 ">like how old 's your mom¿ "             5127         6830 how old 's your mom¿           5963         6830
 4 "(0.855)"                                 6830         7685 how old 's your mom¿           5963         6830
 5 "eh six:ty:::-one= "                      7685         9889 NA                               NA           NA
 6 "(0.101)"                                 9889         9990 NA                               NA           NA
 7 "(0.166)"                               176800       176966 NA                               NA           NA
 8 "=NOW "                                 176966       177372 NA                               NA           NA
 9 "(0.622)"                               177372       177994 NA                               NA           NA
10 "!how! this always plays out  "         177994       179328 NA                               NA           NA
11 "(0.726)"                               179328       180054 [when was] that¿=            180054       180668
12 "[when was] that¿= "                    180054       180668 [when was] that¿=            180054       180668
13 "[yes] "                                180135       180555 [when was] that¿=            180054       180668
14 "(0.163)"                               180668       180831 [when was] that¿=            180054       180668
15 "=!this! was on °Wednesday° "           180831       181720 NA                               NA           NA
16 "(0.273)"                               181720       181993 NA                               NA           NA
17 "oka[y] "                               181993       182398 NA                               NA           NA

How can I join the two data frames more precisely such that both Starttime_ms and Endtime_ms of df2 fall into the respective time intervals of df1?

The desired result is this:

   Utterance                       Starttime_ms.x Endtime_ms.x Q_segment            Starttime_ms.y Endtime_ms.y
   <chr>                                    <dbl>        <dbl> <chr>                         <dbl>        <dbl>
 1 "(5.127)"                                    0         5127 NA                               NA           NA
 2 ">like how old 's your mom¿ "             5127         6830 >like                          5127         5963
 3 ">like how old 's your mom¿ "             5127         6830 how old 's your mom¿           5963         6830
 4 "(0.855)"                                 6830         7685 NA                               NA           NA
 5 "eh six:ty:::-one= "                      7685         9889 NA                               NA           NA
 6 "(0.101)"                                 9889         9990 NA                               NA           NA
 7 "(0.166)"                               176800       176966 NA                               NA           NA
 8 "=NOW "                                 176966       177372 NA                               NA           NA
 9 "(0.622)"                               177372       177994 NA                               NA           NA
10 "!how! this always plays out  "         177994       179328 NA                               NA           NA
11 "(0.726)"                               179328       180054 NA                               NA           NA
12 "[when was] that¿= "                    180054       180668 [when was] that¿=            180054       180668
13 "[yes] "                                180135       180555 NA                               NA           NA
14 "(0.163)"                               180668       180831 NA                               NA           NA
15 "=!this! was on °Wednesday° "           180831       181720 NA                               NA           NA
16 "(0.273)"                               181720       181993 NA                               NA           NA
17 "oka[y] "                               181993       182398 NA                               NA           NA

Solution

  • I'm struggling to understand your desired result, but have you tried dplyr::join_by()? It gives you much more control over your join.

    df1 |> 
      dplyr::left_join(
        df2,
        dplyr::join_by(
          y$Starttime_ms >= x$Starttime_ms,
          y$Endtime_ms <= x$Endtime_ms
        )
      ) |>
      dplyr::select(1:4) # remove duplicate columns
    
    #> # A tibble: 17 x 4
    #>    Utterance                       Starttime_ms.x Endtime_ms.x Q_segment        
    #>    <chr>                                    <dbl>        <dbl> <chr>            
    #>  1 "(5.127)"                                    0         5127  <NA>            
    #>  2 ">like how old 's your mom¿ "             5127         6830 ">like I don 't ~
    #>  3 ">like how old 's your mom¿ "             5127         6830 "sorry like how ~
    #>  4 "(0.855)"                                 6830         7685  <NA>            
    #>  5 "eh six:ty:::-one= "                      7685         9889  <NA>            
    #>  6 "(0.101)"                                 9889         9990  <NA>            
    #>  7 "(0.166)"                               176800       176966  <NA>            
    #>  8 "=NOW "                                 176966       177372  <NA>            
    #>  9 "(0.622)"                               177372       177994  <NA>            
    #> 10 "!how! this always plays out  "         177994       179328  <NA>            
    #> 11 "(0.726)"                               179328       180054  <NA>            
    #> 12 "[when was] that¿= "                    180054       180668 "[when was] that~
    #> 13 "[yes] "                                180135       180555  <NA>            
    #> 14 "(0.163)"                               180668       180831  <NA>            
    #> 15 "=!this! was on °Wednesday° "           180831       181720  <NA>            
    #> 16 "(0.273)"                               181720       181993  <NA>            
    #> 17 "oka[y] "                               181993       182398  <NA>
    

    Created on 2023-09-22 with reprex v2.0.2