Search code examples
rperformancevectorizationmultinomial

Efficient multinomial sampling when sample size and probability vary


This question pertains to efficient sampling from multinomial distributions with varying sample sizes and probabilities. Below I describe the approach I have used, but wonder whether it can be improved with some intelligent vectorisation.

I'm simulating dispersal of organisms amongst multiple populations. Individuals from population j disperse to population i with probability p[i, j]. Given an initial abundance of 10 in population 1, and probabilities of dispersal c(0.1, 0.3, 0.6) to populations 1, 2, and 3, respectively, we can simulate the dispersal process with rmultinom:

set.seed(1)
rmultinom(1, 10, c(0.1, 0.3, 0.6))

#      [,1]
# [1,]    0
# [2,]    3
# [3,]    7

We can extend this to consider n source populations:

set.seed(1)
n <- 3
p <- replicate(n, diff(c(0, sort(runif(n-1)), 1)))
X <- sample(100, n)

Above, p is a matrix of probabilities of moving from one population (column) to another (row), and X is a vector of initial population sizes. The number of individuals dispersing between each pair of populations (and those remaining where they are) can now be simulated with:

sapply(seq_len(ncol(p)), function(i) {
  rmultinom(1, X[i], p[, i])  
})

#      [,1] [,2] [,3]
# [1,]   19   42   11
# [2,]    8   18   43
# [3,]   68    6    8

where the value of the element at the ith row and jth column is the number of individuals moving from population j to population i. The rowSums of this matrix give the new population sizes.

I'd like to repeat this many times, with constant probability matrix but with varying (pre-defined) initial abundances. The following small example achieves this, but is inefficient with larger problems. The resulting matrix gives the post-dispersal abundance in each of three populations for each of 5 simulations for which population had different initial abundances.

X <- matrix(sample(100, n*5, replace=TRUE), nrow=n)

apply(sapply(apply(X, 2, function(x) {
  lapply(seq_len(ncol(p)), function(i) {
    rmultinom(1, x[i], p[, i])  
  })
}), function(x) do.call(cbind, x), simplify='array'), 3, rowSums)

#      [,1] [,2] [,3] [,4] [,5]
# [1,]   79   67   45   28   74
# [2,]   92   99   40   19   52
# [3,]   51   45   16   21   35

Is there a way to better vectorise this problem?


Solution

  • This is a RcppGSL implementation of multi-multinomial. However, it requires you to install gsl independently....which may not be very practical.

    // [[Rcpp::depends(RcppGSL)]]
    
    #include <RcppGSL.h>
    #include <gsl/gsl_rng.h>
    #include <gsl/gsl_randist.h>
    #include <unistd.h>            // getpid
    
    Rcpp::IntegerVector rmn(unsigned int N, Rcpp::NumericVector p, gsl_rng* r){
    
        size_t K = p.size();
    
        Rcpp::IntegerVector x(K);
        gsl_ran_multinomial(r, K, N, p.begin(), (unsigned int *) x.begin());
        return x;             // return results vector
    }
    
    Rcpp::IntegerVector gsl_mmm_1(Rcpp::IntegerVector N, Rcpp::NumericMatrix P, gsl_rng* r){
        size_t K = N.size();
        int i;
        Rcpp::IntegerVector x(K);
        for(i=0; i<K; i++){
            x += rmn(N[i], P(Rcpp::_, i), r);
        }
        return x;
    }
    
    // [[Rcpp::export]]
    Rcpp::IntegerMatrix gsl_mmm(Rcpp::IntegerMatrix X_, Rcpp::NumericMatrix P){
        int j;
        gsl_rng * r = gsl_rng_alloc (gsl_rng_mt19937);
        long seed = rand()/(((double)RAND_MAX + 1)/10000000) * getpid();
        gsl_rng_set (r, seed);
        Rcpp::IntegerMatrix X(X_.nrow(), X_.ncol());
        for(j=0; j<X.ncol(); j++){
            X(Rcpp::_, j) = gsl_mmm_1(X_(Rcpp::_,j), P, r);
        }
        gsl_rng_free (r);
        return X;
    }
    

    I also compare it with a pure R implementation and jbaums's version

    library(Rcpp)
    library(microbenchmark)
    sourceCpp("gsl.cpp")
    
    P = matrix(c(c(0.1,0.2,0.7),c(0.3,0.3,0.4),c(0.5,0.3,0.2)),nc=3)
    X = matrix(c(c(30,40,30),c(20,40,40)), nc=2)
    
    mmm = function(X, P){
        n = ncol(X)
        p = nrow(X)
        Reduce("+", lapply(1:p, function(j) {
            Y = matrix(0,p,n)
            for(i in 1:n) Y[,i] = rmultinom(1, X[j,i], P[,j])
            Y
        }))
    }
    
    jbaums = function(X,P){
        apply(sapply(apply(X, 2, function(x) {
          lapply(seq_len(ncol(P)), function(i) {
            rmultinom(1, x[i], P[, i])
          })
        }), function(x) do.call(cbind, x), simplify='array'), nrow(X), rowSums)
    }
    microbenchmark(jbaums(X,P), mmm(X,P), gsl_mmm(X, P))
    

    and this is the result

    > microbenchmark(jbaums(X,P), mmm(X,P), gsl_mmm(X, P))
    Unit: microseconds
              expr     min       lq  median       uq     max neval
      jbaums(X, P) 165.832 172.8420 179.185 187.2810 339.280   100
         mmm(X, P)  60.071  63.5955  67.437  71.5775  92.963   100
     gsl_mmm(X, P)  10.529  11.8800  13.671  14.6220  40.857   100
    

    The gsl version is about 6x faster than my pure R version.