Search code examples
rjoindata.tablenon-equi-join

Column name labelling in data.table joins


I am trying to join data.table x to z using a non-equi join. Table x contains two columns X1 and X2 that are used as the range to use for joining with column Z1 in z. The current code successfully does the non-equi join however certain columns are removed or renamed. I would like to return the 'ideal' data.table supplied, instead of the one I currently have which I would have to rename columns or join data further to get the 'ideal' data supplied.

> library(data.table)
> 
> x <- data.table(Id  = c("A", "B", "C", "C"),
+                 X1  = c(1L, 3L, 5L, 7L),
+                 X2 = c(8L,12L,9L,18L),
+                 XY  = c("x2", "x4", "x6", "x8"))
> 
> z <- data.table(ID = "C", Z1 = 5:9, Z2 = paste0("z", 5:9))
> 
> x
   Id X1 X2 XY
1:  A  1  8 x2
2:  B  3 12 x4
3:  C  5  9 x6
4:  C  7 18 x8
> z
   ID Z1 Z2
1:  C  5 z5
2:  C  6 z6
3:  C  7 z7
4:  C  8 z8
5:  C  9 z9
> 
> # suboptimal data return data format
> x[z, on = .(Id == ID, X1 <= Z1, X2 >= Z1)]
   Id X1 X2 XY Z2
1:  C  5  5 x6 z5
2:  C  6  6 x6 z6
3:  C  7  7 x6 z7
4:  C  7  7 x8 z7
5:  C  8  8 x6 z8
6:  C  8  8 x8 z8
7:  C  9  9 x6 z9
8:  C  9  9 x8 z9
> 
> # column names are Id, X1 and X2 from x which replaces ID and Z1. The contents of X1 and X2 are also changed to the original values of Z1.
> # XY and Z2 remain unchanged.
> 
> # I want to create the following table where the original column names and values are retained, while still joining the table in a non-equi way.
> 
> ideal <- data.table(ID = c("C", "C", "C", "C", "C", "C", "C", "C"),
+                     Z1 = c(5, 6, 7, 7, 8, 8, 9, 9),
+                     Z2 = c("Z5", "z6", "z7", "z7", "z8", "z8", "z9", "z9"),
+                     X1 = c(5, 5, 5, 7, 5, 7, 5, 7),
+                     X2 = c(9, 9, 9, 18, 9, 18, 9, 18),
+                     XY = c("x6", "x6", "x6", "x8", "x6", "x8", "x6", "x8"))
> 
> print(ideal)
   ID Z1 Z2 X1 X2 XY
1:  C  5 Z5  5  9 x6
2:  C  6 z6  5  9 x6
3:  C  7 z7  5  9 x6
4:  C  7 z7  7 18 x8
5:  C  8 z8  5  9 x6
6:  C  8 z8  7 18 x8
7:  C  9 z9  5  9 x6
8:  C  9 z9  7 18 x8

Solution

  • I ended up answering my own question.

    data_table_tidy_join <- function(x,y, join_by){
    
        x <- data.table(x)
        y <- data.table(y)
    
        # Determine single join names
        single_join_names <- purrr::keep((stringr::str_split(join_by, "==|>=|<=")), ~length(.) == 1) %>% unlist()
    
        # cols from x that won't require as matching in i
        remove_from_x_names <- c(trimws(na.omit(stringr::str_extract(join_by, ".*(?=[=]{2})"))), single_join_names)
    
        # names need to keep
        x_names_keep_raw <- names(x)[!names(x) %in% remove_from_x_names]
        y_names_keep_raw <- names(y)
    
        # cols that exist in both x and y, but not being equi joined on
        cols_rename_index <- x_names_keep_raw[x_names_keep_raw %in% y_names_keep_raw]
    
        #rename so indexing works
        x_names_keep <- x_names_keep_raw
        y_names_keep <- y_names_keep_raw
    
        # give prefix to necessary vars
        x_names_keep[x_names_keep %in% cols_rename_index] <- paste("x.",cols_rename_index, sep ="")
        y_names_keep[y_names_keep %in% cols_rename_index] <- paste("i.",cols_rename_index, sep ="")
    
        # implement data.table call, keeping required cols
        joined_data <-
            x[y, on = join_by,
              mget(c(paste0("i.", y_names_keep_raw),paste0("x.", x_names_keep_raw))) %>% set_names(c(y_names_keep,x_names_keep)),
              mult = "all", allow.cartesian = TRUE, nomatch = NA] %>%
            as_tibble()
    
        return(joined_data)
    
    }
    
    > x <- data.table(Id  = c("A", "B", "C", "C"),
    +                  X1  = c(1L, 3L, 5L, 7L),
    +                  X2 = c(8L,12L,9L,18L),
    +                  XY  = c("x2", "x4", "x6", "x8"))
    >  
    > z <- data.table(ID = "C", Z1 = 5:9, Z2 = paste0("z", 5:9))
    >   
    > data_table_tidy_join(x, z, join_by = c("Id == ID","X1 <= Z1", "X2 >= Z1"))
    # A tibble: 8 x 6
      ID       Z1 Z2       X1    X2 XY   
      <chr> <int> <chr> <int> <int> <chr>
    1 C         5 z5        5     9 x6   
    2 C         6 z6        5     9 x6   
    3 C         7 z7        5     9 x6   
    4 C         7 z7        7    18 x8   
    5 C         8 z8        5     9 x6   
    6 C         8 z8        7    18 x8   
    7 C         9 z9        5     9 x6   
    8 C         9 z9        7    18 x8