Search code examples
r dplyr fuzzyjoin

Function to `interval_left_join` multiple dataframes


I have several dataframes I want to interval_left_join. I could in theory join the dataframes step-by-step but would prefer a function to perform the joins in one go:

Data:

# Base table: the intervals that every other table is matched against.
df1 <- data.frame(
  line  = 1:4,
  key   = c("a", "b", NA, "a"),
  start = c(75, 100, 170, 240),
  end   = c(100, 150, 190, 300)
)

# Lookup table contributing `v2`.
df2 <- data.frame(
  v2    = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end   = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

# Lookup table contributing `v3`.
df3 <- data.frame(
  v3    = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end   = c(75, 171, 210, 400, 1001, 1600)
)

# Lookup table contributing `v4`.
df4 <- data.frame(
  v4    = c("x", "y", "z", "xx", "yy", "zz"),
  start = c(55, 90, 200, 333, 1000, 1500),
  end   = c(1005, 171, 210, 400, 1001, 1600)
)

The variables that I want to funnel into df1 based on their start to end interval are v2, v3, v4. What I've tried so far is the code below: it outputs partly incorrect data for v2 and fails for v3 and v4 entirely -- what's missing or wrong here?

# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("IRanges")

library(BiocManager)
library(fuzzyjoin)
library(data.table)
library(dplyr)
# Attempted one-step join: interval-join `df2` onto `df1` on the start/end
# columns, then collapse the joined rows to one row per run of equal `key`.
#
# df1: left table; the code reads `line`, `key`, `start`, `end` (and `v2`).
# df2: right table; must contain `start` and `end`.
#
# NOTE(review): the summarise step names `v2` explicitly and keeps only
# line, start.x, end.x, key and v2, so a value column brought in by any other
# right-hand table (v3, v4) is not carried into the result.
# NOTE(review): `str_c` is a stringr function, and stringr is not among the
# packages attached by the library() calls in this snippet -- confirm.
join_dataframes <- function(df1, df2) {
  interval_left_join(x = df1,
                     y = df2,
                     by = c("start", "end")) %>%
    # One group per consecutive run of identical `key` values.
    group_by(grp = rleid(key)) %>%
    summarise(across(c(line, start.x, end.x), first), 
                    key = unique(key),
                    # Missing v2 values become "*"; all values are pasted with ",".
                    v2 = str_c(if_else(!is.na(v2), v2, "*" ), collapse = ",")) %>%
    # Restore the left table's interval column names and drop the helper id.
    rename(start = start.x, end = end.x) %>%
    select(-grp)
}

# Fold the join over the list from left to right, so the summarised output of
# one call becomes `df1` of the next call.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df)

The desired result is this:

# A tibble: 4 x 7
   line key   v2    v3    start   end v4   
  <int> <chr> <chr> <chr> <dbl> <dbl> <chr>
1     1 a     D     a,b      75   100 x,y  
2     2 b     D,E,F b       100   150 x,y  
3     3 NA    I     b       170   190 x,y  
4     4 a     J,K,F *       240   300 x 

Solution

  • Perform only the join inside Reduce; the v2, v3 and v4 columns can be summarised after the join.

    library(dplyr)
    library(fuzzyjoin)
    library(data.table)
    
    # Interval-join `df2` onto `df1` on the shared start/end columns, then
    # drop the right-hand interval columns and give the left-hand ones their
    # original names back, so the result can be fed straight into the next join.
    #
    # df1: left table; must contain `start` and `end`.
    # df2: right table; must contain `start` and `end`.
    # Returns the joined rows without `start.y` / `end.y`, with `start.x` /
    # `end.x` renamed to `start` / `end`.
    join_dataframes <- function(df1, df2) {
      joined <- interval_left_join(df1, df2, by = c("start", "end"))
      joined <- select(joined, -start.y, -end.y)
      rename(joined, start = start.x, end = end.x)
    }
    
    # Tables in join order: df1 is the base, the rest are joined onto it in turn.
    list_df <- list(df1, df2, df3, df4)

    # Run all joins first, then collapse to one row per run of identical `key`:
    # keep the first line/start/end of each run and turn every value column
    # into a comma-separated list of its distinct values, with "*" standing in
    # for a missing value.
    Reduce(join_dataframes, list_df) %>%
      group_by(grp = rleid(key)) %>%
      summarise(
        across(c(line, start, end), first),
        across(
          v2:v4,
          function(col) {
            paste(unique(if_else(is.na(col), "*", col)), collapse = ", ")
          }
        ),
        key = unique(key)
      )
    
    
    #    grp  line start   end v2      v3    v4    key  
    #* <int> <int> <dbl> <dbl> <chr>   <chr> <chr> <chr>
    #1     1     1    75   100 D       a, b  x, y  a    
    #2     2     2   100   150 D, E, F b     x, y  b    
    #3     3     3   170   190 I       b     x, y  NA   
    #4     4     4   240   300 J, K, F *     x     a