Fast random sampling from matrix of cumulative probability mass functions in R

I have a matrix (mat_cdf) representing the cumulative probability that an individual in census tract i moves to census tract j on a given day. Given a vector of agents who decide not to "stay home", I have a function, GetCTMove (below), that randomly samples from this matrix to determine which census tract each agent will spend time in.

# Random generation
cts <- 500
# One Poisson rate per census tract
i <- rgamma(cts, 50, 1)
prop <- seq_len(cts)

# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))

# Convert to cumulative probability mass: row r of mat_cdf is the running
# total of row r of mat. cumsum() makes a single pass per row, and because no
# `for (i in ...)` loop is involved the rate vector `i` is not overwritten.
# apply() returns one column per input row, hence the transpose.
mat_cdf <- t(apply(mat, 1, cumsum))

# Sample a destination column for every agent from a matrix of row-wise CDFs.
#
# agent_cts:  row indices into ct_mat_cdf, one per agent (the agent's home ct)
# ct_mat_cdf: matrix whose row i holds the cumulative probabilities for ct i
#
# Returns an integer vector, one entry per agent: the first column whose
# cumulative probability exceeds that agent's uniform draw. If no entry in a
# row exceeds the draw, max.col() sees an all-FALSE row and reports column 1.
GetCTMove <- function(agent_cts, ct_mat_cdf){
  # Expand such that every agent has its own row corresponding to the CDF of
  # movement from their home ct i to j. drop = FALSE keeps a one-row matrix
  # when there is a single agent; without it the row collapses to a vector and
  # max.col() would treat it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]

  # One uniform draw per agent; it is recycled down the columns, so draw k is
  # compared against every entry of row k.
  s <- runif(length(agent_cts))

  # First TRUE in each row = first column whose CDF value is above the draw
  max.col(s < mat_expand, ties.method = "first")
}

# Sample of 500,000 agents' residence ct
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)

# Run function
system.time(GetCTMove(agents, mat_cdf))
 user  system elapsed 
   3.09    1.19    4.30

Working with 1 million agents, each sample takes ~10 seconds to run; multiplied by many time steps, this leads to hours for each simulation, and this function is by far the rate-limiting factor of the model. I'm wondering if anyone has advice on a faster implementation of this kind of random sampling. I've used the dqrng package to speed up random number generation, but that gain is relatively minuscule compared with the matrix expansion (mat_expand) and max.col calls, which take the longest to run.

Solution

The first thing that you can optimise is the following code:

# Index of the first TRUE in each row of the logical matrix `s < mat_expand`
max.col(s < mat_expand, "first")

Since s < mat_expand returns a logical matrix, applying the max.col function is the same as getting the first TRUE in each row. In this case, using which will be much more efficient. Also, as shown below, you store all your CDFs in a matrix.

# One row of Poisson probability masses per rate in `i`
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Row-wise running totals give the matrix of CDFs. No `for (i in ...)` loop is
# used, so the rate vector `i` stays intact for any later code that needs it.
# apply() returns one column per input row, hence the transpose.
mat_cdf <- t(apply(mat, 1, cumsum))

This structure may not be optimal. A list structure is better for applying functions like which. It is also faster to run as you do not have to go through a do.call(rbind, ...).

# Keep the CDFs in a list: one cumulative-probability vector per rate in `i`
ls_cdf <- lapply(i, function(rate) {
  cumsum(dpois(prop, rate))
})

Below is your implementation:

# Implementation 1
# Matrix-based sampler: returns, for each agent, the first column of that
# agent's CDF row whose value exceeds a uniform draw.
#   agent_cts:  row indices into ct_mat_cdf, one per agent
#   ct_mat_cdf: matrix with one CDF per row
GetCTMove <- function(agent_cts, ct_mat_cdf){
  # drop = FALSE keeps a one-row matrix for a single agent; otherwise the row
  # collapses to a vector and max.col() would treat it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent, recycled down the columns of mat_expand
  s <- runif(length(agent_cts))
  max.col(s < mat_expand, ties.method = "first")
}

On my desktop, it takes about 2.68s to run.

> system.time(GetCTMove(agents, mat_cdf))
   user  system elapsed 
   2.25    0.41    2.68

With a list structure and a which function, the run time can be reduced by about 1s.

# Implementation 2
# List-based sampler: walks the agents one at a time and, for each, looks up
# the CDF vector of its home ct and takes the position of the first entry that
# exceeds the agent's uniform draw.
#   agent_cts: indices into ls_cdf, one per agent
#   ls_cdf:    list of CDF vectors
# Returns an integer vector with one position per agent.
GetCTMove2 <- function(agent_cts, ls_cdf){
  draws <- runif(length(agent_cts))
  dest <- integer(length(agent_cts))
  for (k in seq_along(agent_cts)) {
    home_cdf <- ls_cdf[[agent_cts[[k]]]]
    # [[1L]] picks the first matching position
    dest[[k]] <- which(draws[[k]] < home_cdf)[[1L]]
  }
  dest
}

> system.time(GetCTMove2(agents, ls_cdf))
   user  system elapsed 
   1.59    0.02    1.64

To my knowledge, with R only there is no other way to further speed up the code. However, you can indeed improve the performance by re-writing the key function GetCTMove in C++. With the Rcpp package, you can do something as follows:

# Implementation 3
# Compiled helper: for each agent, scan the CDF of that agent's home ct and
# report the 1-based position of the first entry greater than the agent's draw.
#   agents: 1-based indices into cdfs, one per agent
#   s:      one draw per agent (must have the same length as agents)
#   cdfs:   list of numeric CDF vectors
# An agent whose draw is not below any CDF entry gets NA rather than the
# invalid index 0 that a zero-filled result vector would report.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() != n) {
    stop("agents and s must have the same length");
  }
  // NA marks agents for which no CDF entry exceeds the draw
  NumericVector out(n, NA_REAL);
  for (int i = 0; i < n; ++i) {
    // agents holds 1-based R indices, hence the -1
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
    int m = cdf.size();
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R-side wrapper for the compiled sampler: generates one uniform draw per
# agent and passes agents, draws and the list of CDFs to fast_GetCTMove().
GetCTMove3 <- function(agent_cts, ls_cdf){
  draws <- runif(length(agent_cts))
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}

This implementation is lightning fast, which should fulfil all your needs.

> system.time(GetCTMove3(agents, ls_cdf))
   user  system elapsed 
   0.07    0.00    0.06

The full script is attached as follows:

# Random generation
cts <- 500
# One Poisson rate per census tract
i <- rgamma(cts, 50, 1)
prop <- seq_len(cts)
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)

# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# matrix form of the same CDFs, as consumed by Implementation 1.
# The rows are accumulated with cumsum() rather than a `for (i in ...)` loop,
# so the rate vector `i` is not overwritten by a loop index.
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
mat_cdf <- t(apply(mat, 1, cumsum))

# Implementation 1
# Matrix-based sampler: returns, for each agent, the first column of that
# agent's CDF row whose value exceeds a uniform draw.
#   agent_cts:  row indices into ct_mat_cdf, one per agent
#   ct_mat_cdf: matrix with one CDF per row
GetCTMove <- function(agent_cts, ct_mat_cdf){
  # drop = FALSE keeps a one-row matrix for a single agent; otherwise the row
  # collapses to a vector and max.col() would treat it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One draw per agent, recycled down the columns of mat_expand
  s <- runif(length(agent_cts))
  max.col(s < mat_expand, ties.method = "first")
}


# Implementation 2
# List-based sampler: walks the agents one at a time and, for each, looks up
# the CDF vector of its home ct and takes the position of the first entry that
# exceeds the agent's uniform draw.
#   agent_cts: indices into ls_cdf, one per agent
#   ls_cdf:    list of CDF vectors
# Returns an integer vector with one position per agent.
GetCTMove2 <- function(agent_cts, ls_cdf){
  draws <- runif(length(agent_cts))
  dest <- integer(length(agent_cts))
  for (k in seq_along(agent_cts)) {
    home_cdf <- ls_cdf[[agent_cts[[k]]]]
    # [[1L]] picks the first matching position
    dest[[k]] <- which(draws[[k]] < home_cdf)[[1L]]
  }
  dest
}


# Implementation 3
# Compiled helper: for each agent, scan the CDF of that agent's home ct and
# report the 1-based position of the first entry greater than the agent's draw.
#   agents: 1-based indices into cdfs, one per agent
#   s:      one draw per agent (must have the same length as agents)
#   cdfs:   list of numeric CDF vectors
# An agent whose draw is not below any CDF entry gets NA rather than the
# invalid index 0 that a zero-filled result vector would report.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() != n) {
    stop("agents and s must have the same length");
  }
  // NA marks agents for which no CDF entry exceeds the draw
  NumericVector out(n, NA_REAL);
  for (int i = 0; i < n; ++i) {
    // agents holds 1-based R indices, hence the -1
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
    int m = cdf.size();
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R-side wrapper for the compiled sampler: generates one uniform draw per
# agent and passes agents, draws and the list of CDFs to fast_GetCTMove().
GetCTMove3 <- function(agent_cts, ls_cdf){
  draws <- runif(length(agent_cts))
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}


# Time each implementation on the same `agents` vector: the matrix-based
# version takes mat_cdf, the list-based versions take ls_cdf.
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))