The current release of the package gmp
does not support set operations such as intersect
, setdiff
, etc. I'm doing some work with number sequences (see OEIS for examples) and need to handle large collections of large integers. I'm currently stuck with using various loops to generate the desired differences or intersections; while I could probably generate compiled (Rccp, etc) code, I'm hoping to find a way within existing R
functions and packages.
Use bignum
instead of gmp
what returns a character
after using intersect. And reconverting needs time.
library(bignum)
t1 <- biginteger(1:4)
t2 <- biginteger(3:6)
intersect(t1, t2)
#[1] "3" "4"
biginteger(intersect(t1, t2))
#<biginteger[2]>
#[1] 3 4
Store the bigz
in a list
instead as a vector
.
library(gmp)
intersect(as.bigz(1:4), as.bigz(3:6))
#Big Integer ('bigz') object of length 2:
#[1] 1 2
intersect(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))
#[[1]]
#Big Integer ('bigz') :
#[1] 3
#
#[[2]]
#Big Integer ('bigz') :
#[1] 4
setdiff(as.list(as.bigz(1:4)), as.list(as.bigz(3:6)))
#[[1]]
#Big Integer ('bigz') :
#[1] 1
#
#[[2]]
#Big Integer ('bigz') :
#[1] 2
f3 <- as.list(as.bigz(c(3,5,9,6,4)))
f4 <- as.list(as.bigz(c(6,7,8,10,9)))
intersect(f3, f4)
#[[1]]
#Big Integer ('bigz') :
#[1] 9
#
#[[2]]
#Big Integer ('bigz') :
#[1] 6
Unfortunately this is much slower than converting it to character.
For intersect duplicated
could be used. But this is also slower than converting to character.
. <- c(unique(as.bigz(1:4)), unique(as.bigz(3:6)))
.[duplicated(.)]
#Big Integer ('bigz') object of length 2:
#[1] 3 4
And comparing each element with each other is also slower:
t1 <- unique(as.bigz(1:4))
t2 <- unique(as.bigz(3:6))
t1[sapply(t1, function(x) any(x == t2))]
#Big Integer ('bigz') object of length 2:
#[1] 3 4
Marginal faster in this case is:
t1 <- kit::funique(as.character(as.bigz(1:4)))
t2 <- as.character(as.bigz(3:6));
as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])
#Big Integer ('bigz') object of length 2:
#[1] 3 4
And a simple Rcpp versions.
library(Rcpp)
sourceCpp(code=r"(
#include <Rcpp.h>
#include <list>
#include <cstring>
using namespace Rcpp;
// [[Rcpp::export]]
RawVector fun(const RawVector &x, const RawVector &y) {
std::list<uint32_t const*> xAdr;
std::list<uint32_t const*> yAdr;
const uint32_t *px = (const uint32_t *) &x[0];
const uint32_t *py = (const uint32_t *) &y[0];
uint32_t nextNr = 1;
for(uint_fast32_t j=0; j<px[0]; ++j) {
uint32_t n = px[nextNr] + 2;
auto i = xAdr.cbegin();
for(; i != xAdr.cend(); ++i) {
if(std::memcmp(&px[nextNr], *i, n * sizeof(uint32_t)) == 0) break;
}
if(i == xAdr.cend()) xAdr.push_back(&px[nextNr]);
nextNr += n;
}
nextNr = 1;
for(uint_fast32_t j=0; j<py[0]; ++j) {
yAdr.push_back(&py[nextNr]);
nextNr += py[nextNr] + 2;;
}
uint32_t l=1;
for(auto j = xAdr.cbegin(); j != xAdr.cend(); ) {
auto i = yAdr.cbegin();
for(; i != yAdr.cend(); ++i) {
if(std::memcmp(*j, *i, (**j + 2) * sizeof(uint32_t)) == 0) {
yAdr.erase(i);
break;
}
}
if(i == yAdr.cend()) j = xAdr.erase(j);
else {l += **j + 2; ++j;}
}
RawVector res(Rcpp::no_init(l * sizeof(uint32_t)));
res.attr("class") = "bigz";
uint32_t *p = (uint32_t *) &res[0];
p[0] = xAdr.size();
nextNr = 1;
for(auto j = xAdr.cbegin(); j != xAdr.cend(); ++j) {
std::memcpy(&p[nextNr], *j, (**j + 2) * sizeof(uint32_t));
nextNr += p[nextNr] + 2;
}
return res;
}
)")
fun(as.bigz(1:4), as.bigz(3:6))
#Big Integer ('bigz') object of length 2:
#[1] 3 4
Benchmark
library(gmp)
x <- as.bigz("10000000000000000000000000000000000000000")+1:4
y <- as.bigz("10000000000000000000000000000000000000000")+3:6
library(bignum)
xb <- biginteger("10000000000000000000000000000000000000000")+1:4
yb <- biginteger("10000000000000000000000000000000000000000")+3:6
bench::mark(check = FALSE,
"list" = do.call(c, intersect(as.list(x), as.list(y))),
"char" = as.bigz(intersect(as.character(x), as.character(y))),
"dupli" = {. <- c(unique(x), unique(y)); .[duplicated(.)]},
"loop" = {t1 <- unique(x); t2 <- unique(y); t1[sapply(t1, function(x) any(x == t2))]},
"own" = {t1 <- as.character(x); t2 <- as.character(y);
x[!duplicated(t1) & match(t1, t2, 0L) > 0L]},
"own2" = {t1 <- unique(as.character(x)); t2 <- as.character(y);
as.bigz(t1[match(t1, t2, 0L) > 0L])},
"kitFmat" = {t1 <- kit::funique(as.character(x)); t2 <- as.character(y);
as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},
"collFmat" = {t1 <- collapse::funique(as.character(x)); t2 <- as.character(y);
as.bigz(t1[fastmatch::fmatch(t1, t2, 0L) > 0L])},
"bignum" = biginteger(intersect(xb, yb)),
"rcppIdx" = fun(x, y),
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 list 135.89µs 247.15µs 3147. 0B 2.03 1552 1
2 char 15.4µs 20.79µs 29933. 0B 12.0 9996 4
3 dupli 54.58µs 86.69µs 7428. 280B 6.18 3604 3
4 loop 80.11µs 156.36µs 4198. 6.36KB 8.26 2032 4
5 own 15.46µs 27.09µs 25254. 0B 5.05 9998 2
6 own2 13.69µs 20.53µs 28952. 0B 8.69 9997 3
7 kitFmat 13.23µs 20.57µs 30964. 0B 6.19 9998 2
8 collFmat 16.76µs 21.76µs 26667. 2.49KB 5.33 9998 2
9 bignum 32.12µs 48.51µs 10784. 0B 8.47 5090 4
10 rcppIdx 2.35µs 3.09µs 212933. 2.49KB 0 10000 0
Here it looks like that converting to character
and back is currently the fastest way. Even subsetting bigz
is slower than subsetting the character
and converting it to bigz
.
The Version in Rcpp is about 6-7 times faster than the fastest other method.