I have a large dataset and would like to form all pairs of rows satisfying some condition and then calculate some variables based on which parts of the condition were satisfied. The following MWE illustrates what I would like to achieve:
library(data.table)
set.seed(1234)
IDs <- data.table(id = letters[1:10],
                  c1 = sample(1:5, 10, replace = TRUE),
                  c2 = sample(1:5, 10, replace = TRUE),
                  c3 = sample(1:5, 10, replace = TRUE),
                  c = 1)  # constant dummy column used as the join key
IDs.joined <- IDs[IDs, on = 'c', allow.cartesian = TRUE  # full self-join via the dummy key
][c1 != i.c1 & (c2 == i.c2 | c3 == i.c3)  # condition defining which pairs are joined
][, c('Ic2', 'Ic3') := .(c2 == i.c2, c3 == i.c3)  # which part(s) of the condition matched
][, overlap_id := fifelse(Ic2, 2, 3)  # 2 if matched on c2, otherwise 3
][, overlap := Ic2 + Ic3  # number of parts matched (1 or 2)
][, -c('i.c1', 'i.c2', 'i.c3', 'Ic2', 'Ic3')]
The problem is that the full dataset is far too large (~5 million rows) to form the Cartesian self-join. My question is: is there a way to use data.table's syntax to perform a conditional join like this directly, without forming the Cartesian join first and imposing the desired condition second?
I have seen similar problems on SO, but those can typically be expressed as a rolling join; I am not aware of a way to include X | Y statements in the join syntax, and X != Y conditions do not seem to be directly supported either.
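The closest I can get is emulating the X != Y part alone, since on accepts the inequality operators < and >; a minimal sketch of that idea (neq is just a placeholder name, and the c2 == i.c2 | c3 == i.c3 part is still left unexpressed):
# emulate c1 != i.c1 as the union of a '<' join and a '>' join;
# this does not cover the OR condition on c2/c3
neq <- rbindlist(list(
  IDs[IDs, on = .(c1 < c1), .(id = x.id, i.id = i.id), nomatch = 0],
  IDs[IDs, on = .(c1 > c1), .(id = x.id, i.id = i.id), nomatch = 0]
))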
The best option I've found so far for relatively simple conditions like these is to bind the results of multiple joins with rbindlist(). It's not pretty, but it is fast and memory-efficient.
Data:
library(data.table)
set.seed(1234)
IDs <- data.table(id = 1:1e4,
                  c1 = sample(5e3, 1e4, replace = TRUE),
                  c2 = sample(5e3, 1e4, replace = TRUE),
                  c3 = sample(5e3, 1e4, replace = TRUE))
Original solution with a single Cartesian join followed by the required filtering:
f1 <- function(dt) {
  on.exit(try(dt[, c := NULL], TRUE))  # clean up the temporary join key on exit
  dt[
    , c := 0L  # constant dummy key, so the join below is a full Cartesian self-join
  ][
    dt, on = 'c', allow.cartesian = TRUE
  ][
    c1 != i.c1 & (c2 == i.c2 | c3 == i.c3)
  ][
    , c := NULL
  ]
}
Solution using four non-equi joins followed by filtering out duplicates:
f2 <- function(dt) {
  # c1 != i.c1 is split into a '>' join plus a '<' join;
  # the OR over c2/c3 becomes a union of joins on c2 and on c3
  setorder(
    rbindlist(
      list(
        dt[dt, on = .(c1 > c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2,
          c3 = x.c3, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 < c1, c2 == c2), .(id = x.id, c1 = x.c1, c2 = x.c2,
          c3 = x.c3, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 > c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2,
          c3 = x.c3, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0],
        dt[dt, on = .(c1 < c1, c3 == c3), .(id = x.id, c1 = x.c1, c2 = x.c2,
          c3 = x.c3, i.id = i.id, i.c1 = i.c1, i.c2 = i.c2, i.c3 = i.c3), nomatch = 0]
      )
    )
  )[
    # pairs matching on both c2 and c3 are returned by two of the four joins;
    # after sorting the two copies are adjacent, so the alternating mask keeps one of each
    c2 != i.c2 | c3 != i.c3 | rep(!0:1, .N/2L)
  ]
}
Solution using two Cartesian joins, each followed by additional filtering (each join is on a single key, so the intermediate tables stay close to the number of true matches rather than growing quadratically):
f3 <- function(dt) {
  rbindlist(
    list(
      # pairs matching on c3 (c1 must differ); i.c3 is restored from the join key
      dt[dt, on = "c3", allow.cartesian = TRUE][c1 != i.c1][, i.c3 := c3],
      # pairs matching on c2 but not on c3, so pairs already captured above are not duplicated
      dt[dt, on = "c2", allow.cartesian = TRUE][c1 != i.c1 & c3 != i.c3][, i.c2 := c2]
    ),
    use.names = TRUE
  )
}
Benchmarking:
microbenchmark::microbenchmark(f1(IDs),
                               f2(IDs),
                               f3(IDs),
                               times = 10)
#> Unit: milliseconds
#>     expr       min        lq       mean     median        uq       max neval
#>  f1(IDs) 2292.6852 2400.9380 2398.13919 2406.86010 2418.6276 2459.2945    10
#>  f2(IDs)  366.6586  367.2055  373.34916  373.99310  378.5482  383.0736    10
#>  f3(IDs)    6.5007    6.5537    7.24775    6.88965    7.0150   10.6413    10
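Memory can be compared as well; a quick sketch using the bench package (assuming it is installed), since bench::mark() reports allocated memory alongside timings:
# check = FALSE because the three functions return rows in different orders
bench::mark(f1(IDs), f2(IDs), f3(IDs), check = FALSE, iterations = 10)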
identical(setorder(f1(IDs)), f2(IDs))
#> [1] TRUE
identical(f2(IDs), setorder(f3(IDs)))
#> [1] TRUE
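If the overlap variables from the question are needed, the same post-processing as in the MWE can be applied to any of these results; a sketch on f3's output (res is just a placeholder name):
res <- f3(IDs)
res[, c('Ic2', 'Ic3') := .(c2 == i.c2, c3 == i.c3)  # which parts of the condition matched
][, overlap_id := fifelse(Ic2, 2, 3)
][, overlap := Ic2 + Ic3
][, c('Ic2', 'Ic3') := NULL]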