I'd like to remove every item that appears more than once in a vector, so that only the values occurring exactly once remain. This should work for character, numeric, and integer vectors. Currently, I'm using duplicated() both forwards and backwards (via the fromLast parameter).
Is there a more computationally efficient (faster) way to execute this in R? The solution below is simple enough to write/read, but it seems inefficient to execute the duplicate search twice. Perhaps a counting-based method using an additional data structure would be better?
Example:
# Toy input: 1, 2 and 4 occur more than once; 3, 5 and 6 occur exactly once.
d <- c(1, 2, 3, 4, 1, 5, 6, 4, 2, 1)
# Keep an element only when neither the forward scan nor the backward scan
# flags it as a duplicate, i.e. when its value occurs exactly once in d.
d[!duplicated(d) & !duplicated(d, fromLast = TRUE)]
#[1] 3 5 6
Some timings:
# Build the benchmark input: a large integer vector with many repeated values.
set.seed(1001) # fixed seed so the generated vector is reproducible
# 100000 draws with replacement from 1:100000. Spell out TRUE rather than the
# reassignable shorthand T.
d <- sample(1:100000, 100000, replace = TRUE)
# Append 20000 values re-drawn from d itself, so each appended value is
# guaranteed to already be present.
d <- c(d, sample(d, 20000, replace = TRUE)) # ensure many duplicates
# Time eight candidate expressions for extracting the values of d that occur
# exactly once. Requires the microbenchmark package; d must already exist.
# The expressions are kept verbatim because their text is what gets timed and
# what appears as the row labels of the summary.
mb <- microbenchmark::microbenchmark(
# 1. Baseline: duplicated() scanned forwards and backwards, then combined.
d[!(duplicated(d) | duplicated(d, fromLast=TRUE))],
# 2. Drop from d every value that duplicated() flags at least once.
setdiff(d, d[duplicated(d)]),
# 3. Sort, run-length encode, keep runs of length 1 (result is in sorted order).
{tmp <- rle(sort(d)); tmp$values[tmp$lengths == 1]},
# 4. Count with table() (called twice) and convert the names of the count-1
#    entries back with as.integer(), so this form only fits integer-valued d.
as.integer(names(table(d)[table(d)==1])),
# 5. Same as 1, but calling the default method by name instead of the generic.
d[!(duplicated.default(d) | duplicated.default(d, fromLast=TRUE))],
# 6. Keep elements of d that are not among the values flagged by duplicated().
d[!(d %in% d[duplicated(d)])],
# 7. Counting approach: map d onto its unique values, tabulate the positions,
#    keep the unique values counted once.
{ ud = unique(d); ud[tabulate(match(d, ud)) == 1L] },
# 8. Same as 1, but through .Internal() directly.
#    NOTE(review): .Internal() is not a public interface and F/T are
#    reassignable shorthands for FALSE/TRUE; kept as-is to match the recorded
#    output below.
d[!(.Internal(duplicated(d, F, F, NA)) | .Internal(duplicated(d, F, T, NA)))]
)
# Keep only column 1 (expr) and column 4 (mean) of the benchmark summary.
summary(mb)[, c(1, 4)] # in milliseconds
# expr mean
#1 d[!(duplicated(d) | duplicated(d, fromLast = TRUE))] 18.34692
#2 setdiff(d, d[duplicated(d)]) 24.84984
#3 { tmp <- rle(sort(d)) tmp$values[tmp$lengths == 1] } 9.53831
#4 as.integer(names(table(d)[table(d) == 1])) 255.76300
#5 d[!(duplicated.default(d) | duplicated.default(d, fromLast = TRUE))] 18.35360
#6 d[!(d %in% d[duplicated(d)])] 24.01009
#7 { ud = unique(d) ud[tabulate(match(d, ud)) == 1L] } 32.10166
#8 d[!(.Internal(duplicated(d, F, F, NA)) | .Internal(duplicated(d, F, T, NA)))] 18.33475
Given the comments, let's check whether all of these approaches return the correct result:
# Verify that every candidate returns the expected answer on the toy example.
# Reset d first: after the timing run it holds the large benchmark vector, for
# which the expected result c(3, 5, 6) does not apply.
d <- c(1, 2, 3, 4, 1, 5, 6, 4, 2, 1)

# One entry per benchmarked expression, in the same order as the timings.
results <- list(
  d[!(duplicated(d) | duplicated(d, fromLast = TRUE))],
  setdiff(d, d[duplicated(d)]),
  {
    tmp <- rle(sort(d))
    tmp$values[tmp$lengths == 1]
  },
  as.integer(names(table(d)[table(d) == 1])),
  d[!(duplicated.default(d) | duplicated.default(d, fromLast = TRUE))],
  d[!(d %in% d[duplicated(d)])],
  {
    ud <- unique(d)
    ud[tabulate(match(d, ud)) == 1L]
  },
  d[!(.Internal(duplicated(d, FALSE, FALSE, NA)) |
    .Internal(duplicated(d, FALSE, TRUE, NA)))]
)

# Iterate over `results`, not `ls` (which is the base function ls()).
# all.equal() returns a character description on mismatch, so wrap it in
# isTRUE() and use vapply() to guarantee one logical per candidate.
all(vapply(
  results,
  function(res) isTRUE(all.equal(res, c(3, 5, 6))),
  logical(1)
))
# TRUE