I'd like to combine a full join and a update join with a variable number of columns and havn't found an answer covering this case on stackoverflow.
DT
is my initial table:
key_col value_col_1 value_col_2
<char> <int> <int>
1: a 1 11
2: b 2 12
3: c 3 13
4: d 4 14
5: e 5 15
which needs to be updated with the data in updateDT
:
key_col value_col_1 value_col_2 value_col_3
<char> <int> <int> <int>
1: d 11 21 31
2: e 12 22 32
3: f 13 23 33
4: g 14 24 34
updateDT
may add new rows and columns (variable in number) and always has priority over existing data in DT
.
The expected result (resultDT
) for the above example is:
key_col value_col_1 value_col_2 value_col_3
<char> <int> <int> <int>
1: a 1 11 NA
2: b 2 12 NA
3: c 3 13 NA
4: d 11 21 31
5: e 12 22 32
6: f 13 23 33
7: g 14 24 34
I can get there with hard coded column names, however, the code is repetitive and I'm struggling with generalizing this approach to handle e.g. a third or fourth value_col
in updateDT
dynamically:
library(data.table)
DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_1 = 11:14, value_col_2 = 21:24, value_col_3 = 31:34)
key_column <- "key_col"
value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)
resultDT <- merge(DT, updateDT, by = "key_col", all = TRUE)
resultDT[, value_col_1 := fifelse(is.na(value_col_1.y), yes = value_col_1.x, no = value_col_1.y)]
resultDT[, value_col_2 := fifelse(is.na(value_col_2.y), yes = value_col_2.x, no = value_col_2.y)]
resultDT[, c("value_col_1.x", "value_col_1.y", "value_col_2.x", "value_col_2.y") := NULL]
print(resultDT)
Edit: accepting ismirsehregal's answer as it is faster - Benchmark:
library(microbenchmark)
microbenchmark(
ismirsehregal = {
resultDT1 <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
resultDT1 <- resultDT1[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
},
Wimpel = {
DT[updateDT, on=.(key_col), (value_columns) := mget(paste0("i.", value_columns))]
resultDT2 <- rbindlist(list(DT, updateDT[!key_col %in% DT$key_col,]), use.names = TRUE, fill = TRUE)
}, times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
ismirsehregal 1.2166 1.2345 1.33790 1.2933 1.3531 1.8147 10
Wimpel 3.5286 3.5989 3.85624 3.7680 4.0916 4.5984 10
2. Edit: Example of updateDT not containing all of DTs columns:
library(data.table)
DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
key_column <- "key_col"
# value_columns <- setdiff(c(names(DT), names(updateDT)), key_column) # Error: value for ‘i.value_col_1’ not found
value_columns <- setdiff(intersect(names(DT), names(updateDT)), key_column) # does not insert all data from updateDT
DT[updateDT, on=.(key_col), (value_columns) := mget(paste0("i.", value_columns))]
rbindlist(list(DT, updateDT[!key_col %in% DT$key_col,]), use.names = TRUE, fill = TRUE)
You can use rbindlist
to achive this:
library(data.table)
DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_1 = 11:14, value_col_2 = 21:24, value_col_3 = 31:34)
key_column <- "key_col"
value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)
resultDT <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
resultDT <- resultDT[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
print(resultDT)
This is basically row binding the new table to the old one and removing duplicates based on the origin.
Edit: to avoid dropping data when not all of DTs column exist in updateDT we can simply add them upfront:
library(data.table)
DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
key_column <- "key_col"
value_columns <- setdiff(c(names(DT), names(updateDT)), key_column)
keep_columns <- c(key_column, setdiff(names(DT), names(updateDT)))
updateDT <- DT[, ..keep_columns][updateDT, on = key_column]
resultDT <- rbindlist(list(old = DT, new = updateDT), use.names = TRUE, fill = TRUE, idcol = "origin")[, key_count := .N, by = key_column]
resultDT <- resultDT[key_count == 1L | key_count > 1L & origin == 'new'][, c("key_count", "origin") := NULL]
print(resultDT)
In the same manner we can use @Tobo's one-liner:
library(data.table)
DT <- data.table(key_col = letters[1:5], value_col_1 = 1:5, value_col_2 = 11:15)
updateDT <- data.table(key_col = letters[4:7], value_col_2 = 11:14, value_col_3 = 21:24, value_col_4 = 31:34)
key_column <- "key_col"
keep_columns <- c(key_column, setdiff(names(DT), names(updateDT)))
updateDT <- DT[, ..keep_columns][updateDT, on = key_column]
rbind(DT[!updateDT, on = key_column], updateDT, use.names = TRUE, fill = TRUE)