Search code examples
rsetgmpset-intersection

Efficient code to do set operations with bigz-class values?


The current release of the package gmp does not support set operations such as intersect, setdiff , etc. I'm doing some work with number sequences (see OEIS for examples) and need to handle large collections of large integers. I'm currently stuck with using various loops to generate the desired differences or intersections; while I could probably generate compiled (Rccp, etc) code, I'm hoping to find a way within existing R functions and packages.


Solution

  • Use bignum instead of gmp what returns a character after using intersect. And reconverting needs time.

    library(bignum)
    
    t1 <- biginteger(1:4)
    t2 <- biginteger(3:6)
    intersect(t1, t2)
    #[1] "3" "4"
    
    biginteger(intersect(t1, t2))
    #<biginteger[2]>
    #[1] 3 4
    

    Store the bigz in a list instead as a vector.

    library(gmp)
    
    intersect(as.bigz(1:4), as.bigz(3:6))
    #Big Integer ('bigz') object of length 2:
    #[1] 1 2
    
    intersect(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))
    #[[1]]
    #Big Integer ('bigz') :
    #[1] 3
    #
    #[[2]]
    #Big Integer ('bigz') :
    #[1] 4
    
    setdiff(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))
    #[[1]]
    #Big Integer ('bigz') :
    #[1] 1
    #
    #[[2]]
    #Big Integer ('bigz') :
    #[1] 2
    
    f3 <- as.list(as.bigz(c(3,5,9,6,4)))
    f4 <- as.list(as.bigz(c(6,7,8,10,9)))
    intersect(f3, f4)
    #[[1]]
    #Big Integer ('bigz') :
    #[1] 9
    #
    #[[2]]
    #Big Integer ('bigz') :
    #[1] 6
    

    Unfortunately this is much slower than converting it to character.

    For intersect duplicated could be used. But this is also slower than converting to character.

    . <- c(unique(as.bigz(1:4)), unique(as.bigz(3:6)))
    .[duplicated(.)]
    #Big Integer ('bigz') object of length 2:
    #[1] 3 4
    

    And comparing each element with each other is also slower:

    t1 <- unique(as.bigz(1:4))
    t2 <- unique(as.bigz(3:6))
    t1[sapply(t1, function(x) any(x == t2))]
    #Big Integer ('bigz') object of length 2:
    #[1] 3 4
    

    Marginal faster in this case is:

    t1 <- kit::funique(as.character(as.bigz(1:4)))
    t2 <- as.character(as.bigz(3:6));
    as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])
    #Big Integer ('bigz') object of length 2:
    #[1] 3 4
    

    And a simple Rcpp versions.

    library(Rcpp)
    sourceCpp(code=r"(
    #include <Rcpp.h>
    #include <list>
    #include <cstring>
    using namespace Rcpp;
    // [[Rcpp::export]]
    RawVector fun(const RawVector &x, const RawVector &y) {
      std::list<uint32_t const*> xAdr;
      std::list<uint32_t const*> yAdr;
      const uint32_t *px = (const uint32_t *) &x[0];
      const uint32_t *py = (const uint32_t *) &y[0];
      uint32_t nextNr = 1;
      for(uint_fast32_t j=0; j<px[0]; ++j) {
        uint32_t n = px[nextNr] + 2;
        auto i = xAdr.cbegin();
        for(; i != xAdr.cend(); ++i) {
          if(std::memcmp(&px[nextNr], *i, n * sizeof(uint32_t)) == 0) break;
        }
        if(i == xAdr.cend()) xAdr.push_back(&px[nextNr]);
        nextNr += n;
      }
      nextNr = 1;
      for(uint_fast32_t j=0; j<py[0]; ++j) {
        yAdr.push_back(&py[nextNr]);
        nextNr += py[nextNr] + 2;;
      }
      uint32_t l=1;
      for(auto j = xAdr.cbegin(); j != xAdr.cend(); ) {
        auto i = yAdr.cbegin();
        for(; i != yAdr.cend(); ++i) {
          if(std::memcmp(*j, *i, (**j + 2) * sizeof(uint32_t)) == 0) {
            yAdr.erase(i);
            break;
          }
        }
        if(i == yAdr.cend()) j = xAdr.erase(j);
        else {l += **j + 2; ++j;}
      }
      RawVector res(Rcpp::no_init(l * sizeof(uint32_t)));
      res.attr("class") = "bigz";
      uint32_t *p = (uint32_t *) &res[0];
      p[0] = xAdr.size();
      nextNr = 1;
      for(auto j = xAdr.cbegin(); j != xAdr.cend(); ++j) {
        std::memcpy(&p[nextNr], *j, (**j + 2) * sizeof(uint32_t));
        nextNr += p[nextNr] + 2;
      }
      return res;
    }
    )")
    
    fun(as.bigz(1:4), as.bigz(3:6))
    #Big Integer ('bigz') object of length 2:
    #[1] 3 4
    

    Benchmark

    library(gmp)
    x <- as.bigz("10000000000000000000000000000000000000000")+1:4
    y <- as.bigz("10000000000000000000000000000000000000000")+3:6
    
    library(bignum)
    xb <- biginteger("10000000000000000000000000000000000000000")+1:4
    yb <- biginteger("10000000000000000000000000000000000000000")+3:6
    
    bench::mark(check = FALSE,
             "list" = do.call(c, intersect(as.list(x), as.list(y))),
             "char" = as.bigz(intersect(as.character(x), as.character(y))),
             "dupli" = {. <- c(unique(x), unique(y)); .[duplicated(.)]},
             "loop" = {t1 <- unique(x); t2 <- unique(y); t1[sapply(t1, function(x) any(x == t2))]},
             "own" = {t1 <- as.character(x); t2 <- as.character(y);
               x[!duplicated(t1) & match(t1, t2, 0L) > 0L]},
             "own2" = {t1 <- unique(as.character(x)); t2 <- as.character(y);
               as.bigz(t1[match(t1, t2, 0L) > 0L])},
             "kitFmat" = {t1 <- kit::funique(as.character(x)); t2 <- as.character(y);
               as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},
             "collFmat" = {t1 <- collapse::funique(as.character(x)); t2 <- as.character(y);
               as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},
             "bignum" = biginteger(intersect(xb, yb)),
             "rcppIdx" = fun(x, y),
             )
    

    Result

       expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
       <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
     1 list       135.89µs 247.15µs     3147.        0B     2.03  1552     1
     2 char         15.4µs  20.79µs    29933.        0B    12.0   9996     4
     3 dupli       54.58µs  86.69µs     7428.      280B     6.18  3604     3
     4 loop        80.11µs 156.36µs     4198.    6.36KB     8.26  2032     4
     5 own         15.46µs  27.09µs    25254.        0B     5.05  9998     2
     6 own2        13.69µs  20.53µs    28952.        0B     8.69  9997     3
     7 kitFmat     13.23µs  20.57µs    30964.        0B     6.19  9998     2
     8 collFmat    16.76µs  21.76µs    26667.    2.49KB     5.33  9998     2
     9 bignum      32.12µs  48.51µs    10784.        0B     8.47  5090     4
    10 rcppIdx      2.35µs   3.09µs   212933.    2.49KB     0    10000     0
    

    Here it looks like that converting to character and back is currently the fastest way. Even subsetting bigz is slower than subsetting the character and converting it to bigz.
    The Version in Rcpp is about 6-7 times faster than the fastest other method.