Search code examples
rperformancedata.tablemapply

R - Find intersect of common elements in two character strings


I am looking for the fastest possible way of counting the common elements in two character strings.

The elements in the strings are separated by a |.

Mock data:

library(data.table)
dt <- data.table(input1 = c("A|B", "C|D|", "R|S|T", "A|B"),
                 input2 = c("A|B|C|D|E|F", "C|D|E|F|G", "R|S|T", "X|Y|Z"))

Count he common elements in character strings and create dt$outcome.

dt <- transform(dt, var1 = I(strsplit(as.character(input1), "\\|")))
dt <- transform(dt, var2 = I(strsplit(as.character(input2), "\\|")))
dt <- transform(dt, outcome = mapply(function(x, y) sum(x%in%y),
                                 var1, var2))

Result:

> dt
   input1      input2  var1        var2 outcome
1:    A|B A|B|C|D|E|F   A,B A,B,C,D,E,F       2
2:   C|D|   C|D|E|F|G   C,D   C,D,E,F,G       2
3:  R|S|T       R|S|T R,S,T       R,S,T       3
4:    A|B       X|Y|Z   A,B       X,Y,Z       0

This example works great, but the real data has thousands of elements for input1 and input2 and has over 200,000 rows. The current code runs therefore for days and can't be put into production.

How can we speed this up?

Columns dt$var1 and dt$var2 are not required outputs and can be left out.


Solution

  • dt[, outcome:= lengths(str_extract_all(input2, sub('[|]$', '',input1)))][]
       input1      input2 outcome
    1:    A|B A|B|C|D|E|F       2
    2:   C|D|   C|D|E|F|G       2
    3:  R|S|T       R|S|T       3
    4:    A|B       X|Y|Z       0
    

    You could speed up the process by writing the code in either C++, C or Fortran. Lets look how the C++ code will look like:

    Rcpp::cppFunction('
      std::vector<int> count_intersect(std::vector<std::string> vec1,
                   std::vector<std::string> vec2, char split){
      auto  string_split = [=](std::string x) {
        std::vector<std::string> vec;
        std::string sub_string;
        for(auto i: x){
          if(i == split) {
            vec.push_back(sub_string);
            sub_string = "";
          }
          else sub_string+=i;
        }
        if(sub_string.size() > 0)vec.push_back(sub_string);
        return  vec;
      };
      
      auto count = [=](std::string input1, std::string input2){
        std::vector<std::string> in1 = string_split(input1);
        std::vector<std::string> in2 = string_split(input2);
        int total = 0;
        for (auto i: in1) 
          if(std::find(in2.begin(), in2.end(), i) != in2.end()) total += 1;
        return total;
      };
      std::size_t len1 = vec1.size();
      std::vector<int> result(len1);
      for (std::size_t i = 0; i<len1; i++)
        result[i] = count(vec1[i], vec2[i]);
      return result;
    }')
    
     dt[, outcome:=count_intersect(input1, input2, "|")][]
           input1      input2 outcome
        1:    A|B A|B|C|D|E|F       2
        2:   C|D|   C|D|E|F|G       2
        3:  R|S|T       R|S|T       3
        4:    A|B       X|Y|Z       0
        
    

    Doing the BenchMark: With really large data ie 200,000 rows:

    bigdt <- mosaic::sample(dt, 200000, TRUE)[,1:2]
    inputs <- c("input1", "input2")
    vars <- c("var1", "var2")
    
    bench::mark(OP = {
      bigdt <- transform(bigdt, var1 = I(strsplit(as.character(input1), "\\|")))
      bigdt <- transform(bigdt, var2 = I(strsplit(as.character(input2), "\\|")))
      bigdt <- transform(bigdt, outcome = mapply(function(x, y) sum(x%in%y), var1, var2))
    },
    r2evans = {
      bigdt[, (vars) := lapply(.SD, strsplit, "|", fixed = TRUE), .SDcols = inputs
      ][, outcome := mapply(function(x, y) sum(x %in% y), var1, var2)]
    },
    r2evans2 = {bigdt[, outcome := mapply(function(x, y) sum(x %in% y), 
                              strsplit(input1, "|", fixed = TRUE), 
                              strsplit(input2, "|", fixed = TRUE)) ]},
    onyambu = {
      bigdt[, outcome:= lengths(stringr::str_extract_all(input2, sub('[|]$', '',input1)))]
    },
    onyambuCpp = bigdt[, outcome:=count_intersect(input1, input2, "|")],
     relative = TRUE
    )
    
    
    
     A tibble: 5 x 13
      expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                     memory     time       gc      
      <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list>                     <list>     <list>     <list>  
    1 OP         12.4   12.1       1        30.9       Inf     1     6      1.66s <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
    2 r2evans     4.77   4.66      2.60      5.72      Inf     1     3   641.39ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
    3 r2evans2    6.08   5.94      2.04      5.70      Inf     1     5    817.4ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
    4 onyambu     7.36   7.20      1.68      2.47      NaN     1     0   990.19ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
    5 onyambuCpp  1      1        12.1       1         NaN     4     0   549.54ms <data.table [200,000 x 5]> <Rprofmem> <bench_tm> <tibble>
    

    Note that the the unit is relative, and CPP atleast 4* faster than the next method.