I am looking for the fastest possible way of counting the common elements in two character strings. The elements in the strings are separated by a |.
library(data.table)
dt <- data.table(input1 = c("A|B", "C|D|", "R|S|T", "A|B"),
                 input2 = c("A|B|C|D|E|F", "C|D|E|F|G", "R|S|T", "X|Y|Z"))
Count the common elements in the character strings and create dt$outcome:
dt <- transform(dt, var1 = I(strsplit(as.character(input1), "\\|")))
dt <- transform(dt, var2 = I(strsplit(as.character(input2), "\\|")))
dt <- transform(dt, outcome = mapply(function(x, y) sum(x %in% y),
                                     var1, var2))
> dt
input1 input2 var1 var2 outcome
1: A|B A|B|C|D|E|F A,B A,B,C,D,E,F 2
2: C|D| C|D|E|F|G C,D C,D,E,F,G 2
3: R|S|T R|S|T R,S,T R,S,T 3
4: A|B X|Y|Z A,B X,Y,Z 0
This example works great, but the real data has thousands of elements in input1 and input2 and over 200,000 rows. The current code therefore runs for days and can't be put into production.
How can we speed this up?
Columns dt$var1 and dt$var2 are not required outputs and can be left out.
One option is to turn input1 into a regular expression (stripping any trailing |, so that e.g. C|D| becomes the alternation C|D) and count how often that pattern matches in input2:

library(stringr)
dt[, outcome := lengths(str_extract_all(input2, sub('[|]$', '', input1)))][]
input1 input2 outcome
1: A|B A|B|C|D|E|F 2
2: C|D| C|D|E|F|G 2
3: R|S|T R|S|T 3
4: A|B X|Y|Z 0
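Note that this relies on regex semantics: input1 (with any trailing | stripped) is used as an alternation pattern, so matches are counted wherever they occur, even inside longer elements, and elements containing regex metacharacters would need escaping. A tiny hypothetical illustration of how this can differ from a true set intersection:

library(stringr)
# "A" matches inside "AB", so the pattern-based count is 1 ...
lengths(str_extract_all("AB", "A"))
# ... while the set intersection of {"A"} and {"AB"} is empty
sum("A" %in% strsplit("AB", "|", fixed = TRUE)[[1]])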
You could speed up the process by writing the code in C++, C, or Fortran. Let's look at what the C++ version might look like:
Rcpp::cppFunction('
std::vector<int> count_intersect(std::vector<std::string> vec1,
                                 std::vector<std::string> vec2, char split){
  // Split a string into its elements on the separator character
  auto string_split = [=](std::string x) {
    std::vector<std::string> vec;
    std::string sub_string;
    for(auto i: x){
      if(i == split) {
        vec.push_back(sub_string);
        sub_string = "";
      }
      else sub_string += i;
    }
    // Push the last element unless the string ended with the separator
    if(sub_string.size() > 0) vec.push_back(sub_string);
    return vec;
  };
  // Count how many elements of input1 also occur in input2
  auto count = [=](std::string input1, std::string input2){
    std::vector<std::string> in1 = string_split(input1);
    std::vector<std::string> in2 = string_split(input2);
    int total = 0;
    for (auto i: in1)
      if(std::find(in2.begin(), in2.end(), i) != in2.end()) total += 1;
    return total;
  };
  // Apply the pairwise count element-wise over the two input vectors
  std::size_t len1 = vec1.size();
  std::vector<int> result(len1);
  for (std::size_t i = 0; i < len1; i++)
    result[i] = count(vec1[i], vec2[i]);
  return result;
}')
dt[, outcome:=count_intersect(input1, input2, "|")][]
input1 input2 outcome
1: A|B A|B|C|D|E|F 2
2: C|D| C|D|E|F|G 2
3: R|S|T R|S|T 3
4: A|B X|Y|Z 0
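With thousands of elements per row, the linear std::find scan above makes each row O(n × m). As a minimal sketch of an alternative (my own variant with the hypothetical name count_intersect_set, not part of the benchmark below), hashing the elements of input2 into a std::unordered_set makes each membership test roughly O(1), so a row costs about O(n + m) instead:

Rcpp::cppFunction('
std::vector<int> count_intersect_set(std::vector<std::string> vec1,
                                     std::vector<std::string> vec2, char split){
  // Same splitting logic as count_intersect above
  auto string_split = [=](const std::string& x) {
    std::vector<std::string> vec;
    std::string sub_string;
    for(auto i: x){
      if(i == split) { vec.push_back(sub_string); sub_string = ""; }
      else sub_string += i;
    }
    if(sub_string.size() > 0) vec.push_back(sub_string);
    return vec;
  };
  std::size_t len = vec1.size();
  std::vector<int> result(len);
  for (std::size_t i = 0; i < len; i++){
    // Hash the elements of input2 once, then probe with each element of input1
    std::vector<std::string> in2 = string_split(vec2[i]);
    std::unordered_set<std::string> set2(in2.begin(), in2.end());
    int total = 0;
    for (const auto& el : string_split(vec1[i]))
      if (set2.count(el)) total += 1;
    result[i] = total;
  }
  return result;
}', includes = "#include <unordered_set>")

Calling dt[, outcome := count_intersect_set(input1, input2, "|")] should return the same counts.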
Benchmarking with really large data, i.e. 200,000 rows:
bigdt <- mosaic::sample(dt, 200000, TRUE)[,1:2]
inputs <- c("input1", "input2")
vars <- c("var1", "var2")
bench::mark(OP = {
bigdt <- transform(bigdt, var1 = I(strsplit(as.character(input1), "\\|")))
bigdt <- transform(bigdt, var2 = I(strsplit(as.character(input2), "\\|")))
bigdt <- transform(bigdt, outcome = mapply(function(x, y) sum(x%in%y), var1, var2))
},
r2evans = {
bigdt[, (vars) := lapply(.SD, strsplit, "|", fixed = TRUE), .SDcols = inputs
][, outcome := mapply(function(x, y) sum(x %in% y), var1, var2)]
},
r2evans2 = {bigdt[, outcome := mapply(function(x, y) sum(x %in% y),
strsplit(input1, "|", fixed = TRUE),
strsplit(input2, "|", fixed = TRUE)) ]},
onyambu = {
bigdt[, outcome:= lengths(stringr::str_extract_all(input2, sub('[|]$', '',input1)))]
},
onyambuCpp = bigdt[, outcome:=count_intersect(input1, input2, "|")],
relative = TRUE
)
# A tibble: 5 x 13
  expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                     memory     time       gc
  <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list>                     <list>     <list>     <list>
1 OP         12.4   12.1       1        30.9       Inf     1     6      1.66s <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
2 r2evans     4.77   4.66      2.60      5.72      Inf     1     3   641.39ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
3 r2evans2    6.08   5.94      2.04      5.70      Inf     1     5    817.4ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
4 onyambu     7.36   7.20      1.68      2.47      NaN     1     0   990.19ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
5 onyambuCpp  1      1        12.1       1         NaN     4     0   549.54ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
Note that the units are relative: the C++ version is at least 4× faster than the next-fastest method.
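As a quick sanity check (my addition, not part of the original benchmark), the C++ method reproduces the plain strsplit/%in% counts on the small example:

v1 <- dt[, mapply(function(x, y) sum(x %in% y),
                  strsplit(input1, "|", fixed = TRUE),
                  strsplit(input2, "|", fixed = TRUE))]
v2 <- dt[, count_intersect(input1, input2, "|")]
identical(v1, v2)  # should be TRUE if both methods agree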