Search code examples
rdata.table

How to combine a full join and a update join with a variable number of columns using data.table


I'd like to combine a full join and a update join with a variable number of columns and havn't found an answer covering this case on stackoverflow.

DT is my initial table:

   key_col value_col_1 value_col_2
    <char>       <int>       <int>
1:       a           1          11
2:       b           2          12
3:       c           3          13
4:       d           4          14
5:       e           5          15

which needs to be updated with the data in updateDT:

   key_col value_col_1 value_col_2 value_col_3
    <char>       <int>       <int>       <int>
1:       d          11          21          31
2:       e          12          22          32
3:       f          13          23          33
4:       g          14          24          34

updateDT may add new rows and columns (variable in number) and always has priority over existing data in DT.

The expected result (resultDT) for the above example is:

   key_col value_col_1 value_col_2 value_col_3
    <char>       <int>       <int>       <int>
1:       a           1          11          NA
2:       b           2          12          NA
3:       c           3          13          NA
4:       d          11          21          31
5:       e          12          22          32
6:       f          13          23          33
7:       g          14          24          34

I can get there with hard coded column names, however, the code is repetitive and I'm struggling with generalizing this approach to handle e.g. a third or fourth value_col in updateDT dynamically:

library(data.table)

DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_1 = 11:14, value_col_2 = 21:24, value_col_3 = 31:34)
key_column <- "key_col"
value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)

resultDT <- merge(DT, updateDT, by = "key_col", all = TRUE)
resultDT[, value_col_1 := fifelse(is.na(value_col_1.y), yes = value_col_1.x, no = value_col_1.y)]
resultDT[, value_col_2 := fifelse(is.na(value_col_2.y), yes = value_col_2.x, no = value_col_2.y)]
resultDT[, c("value_col_1.x", "value_col_1.y", "value_col_2.x", "value_col_2.y") := NULL]
print(resultDT)

Edit: accepting ismirsehregal's answer as it is faster - Benchmark:

library(microbenchmark)
microbenchmark(
  ismirsehregal = {
    resultDT1 <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
    resultDT1 <- resultDT1[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
    },
  Wimpel = {
    DT[updateDT, on=.(key_col), (value_columns) := mget(paste0("i.", value_columns))]
    resultDT2 <- rbindlist(list(DT, updateDT[!key_col %in% DT$key_col,]), use.names = TRUE, fill = TRUE)
    }, times = 10L)

Unit: milliseconds
          expr    min     lq    mean median     uq    max neval
 ismirsehregal 1.2166 1.2345 1.33790 1.2933 1.3531 1.8147    10
        Wimpel 3.5286 3.5989 3.85624 3.7680 4.0916 4.5984    10

2. Edit: Example of updateDT not containing all of DTs columns:

library(data.table)

DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
key_column <- "key_col"
# value_columns <- setdiff(c(names(DT), names(updateDT)), key_column) # Error: value for ‘i.value_col_1’ not found
value_columns <- setdiff(intersect(names(DT), names(updateDT)), key_column) # does not insert all data from updateDT

DT[updateDT, on=.(key_col), (value_columns) := mget(paste0("i.", value_columns))]
rbindlist(list(DT, updateDT[!key_col %in% DT$key_col,]), use.names = TRUE, fill = TRUE)

Solution

  • You can use rbindlist to achive this:

    library(data.table)
    
    DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
    updateDT <- data.table(key_col = letters[4:7], value_col_1 = 11:14, value_col_2 = 21:24, value_col_3 = 31:34)
    key_column <- "key_col"
    value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)
    
    resultDT <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
    resultDT <- resultDT[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
    print(resultDT)
    

    This is basically row binding the new table to the old one and removing duplicates based on the origin.

    Edit: to avoid dropping data when not all of DTs column exist in updateDT we can simply add them upfront:

    library(data.table)
    
    DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
    updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
    key_column <- "key_col"
    value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)
    
    keep_columns <- c(key_column, setdiff(names(DT), names(updateDT)))
    updateDT <- DT[, ..keep_columns][updateDT, on = key_column]
    
    resultDT <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
    resultDT <- resultDT[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
    print(resultDT)
    

    In the same manner we can use @Tobo's one-liner:

    library(data.table)
    
    DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
    updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
    key_column <- "key_col"
    
    keep_columns <- c(key_column, setdiff(names(DT), names(updateDT)))
    updateDT <- DT[, ..keep_columns][updateDT, on = key_column]
    rbind(DT[!updateDT, on = key_column], updateDT, use.names = TRUE, fill = TRUE)