rstatisticsvectorizationlapply

Fast binom.test in R for multiple levels of confidence?


I have a dataset df which includes two variables: successes (x) and trials (n). I wish to obtain for every row in that dataset the stats::binom.test() results (est, lower and upper ci) for different levels of confidence. It should be able to handle n = NA or 0.

I wrote the code below but I was hoping for a faster implementation since my true dataset is very large (30 million rows). Any suggestions that will improve the speed will be appreciated (vectorisation, parallelisation...?), preferably a "tidyapproach" but not required.

library(tidyverse)
#function: 
#uses lapply to loop over the conf.levels for the same x&n pair:
binom_test <- function(x, n, conf.levels = c(0.50, 0.75, 0.90, 0.95)) {
  if (is.na(n) | n < 1) {
    return(data.frame(
      conf_level = conf.levels,
      est = rep(NA, length(conf.levels)),
      x = rep(NA, length(conf.levels)),
      n = rep(NA, length(conf.levels)),
      lower_ci = rep(NA, length(conf.levels)),
      upper_ci = rep(NA, length(conf.levels))
    ))
  } else {
    results <- lapply(conf.levels, function(conf.level) {
      test <- binom.test(x = x,
                         n = n,
                         conf.level = conf.level)
      c(
        conf_level = conf.level,
        x = x,
        n = n,
        est = test$estimate[[1]],
        lower_ci = test$conf.int[1],
        upper_ci = test$conf.int[2]
      )
    })
    
    results_df <- as.data.frame(do.call(rbind, results))
    return(results_df)
  }
}

# data
df <- tibble(
  successes = round(runif(1000, 0, 100)),
  trials = round(runif(1000, 0, 100)) + 100
)

# requires rowwise (slower)
df %>%
  rowwise() %>%
  mutate(beta = list(binom_test(x = successes, n = trials))) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#>    successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>        <dbl>  <dbl>           <dbl>  <dbl>  <dbl>    <dbl>         <dbl>
#>  1        78    180            0.5      78    180    0.433         0.406
#>  2        78    180            0.75     78    180    0.433         0.389
#>  3        78    180            0.9      78    180    0.433         0.371
#>  4        78    180            0.95     78    180    0.433         0.360
#>  5        87    140            0.5      87    140    0.621         0.590
#>  6        87    140            0.75     87    140    0.621         0.570
#>  7        87    140            0.9      87    140    0.621         0.549
#>  8        87    140            0.95     87    140    0.621         0.536
#>  9        89    129            0.5      89    129    0.690         0.658
#> 10        89    129            0.75     89    129    0.690         0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci <dbl>

# avoids rowwise but still slow
df %>%
  mutate(beta = pmap(list(successes, trials), binom_test)) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#>    successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#>        <dbl>  <dbl>           <dbl>  <dbl>  <dbl>    <dbl>         <dbl>
#>  1        78    180            0.5      78    180    0.433         0.406
#>  2        78    180            0.75     78    180    0.433         0.389
#>  3        78    180            0.9      78    180    0.433         0.371
#>  4        78    180            0.95     78    180    0.433         0.360
#>  5        87    140            0.5      87    140    0.621         0.590
#>  6        87    140            0.75     87    140    0.621         0.570
#>  7        87    140            0.9      87    140    0.621         0.549
#>  8        87    140            0.95     87    140    0.621         0.536
#>  9        89    129            0.5      89    129    0.690         0.658
#> 10        89    129            0.75     89    129    0.690         0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci <dbl>

#parallel?
# library(furrr)
# plan(multisession, workers = 5)
# df %>%
#   mutate(beta = furrr::future_pmap(list(successes, trials), binom_test)) %>%
#   unnest(beta, names_sep = "_")

Created on 2023-11-24 with reprex v2.0.2

EDIT: I found that the Hmisc package has a vectorised function to compute the CI. But I cannot handle NAs as above...

library(tidyverse)
library(Hmisc)

binom_conf <- function(x, n, alpha) {
  out <- Hmisc::binconf(
    x = x,
    n = n,
    alpha = alpha,
    method = "exact",
    return.df = TRUE
  )
  conf_level <- 1 - alpha
  out$conf_level <- conf_level
  rownames(out) <- NULL
  names(out) <- c("est", "lower_ci", "upper_ci", "conf_level")
  return(out)
}

df <- tibble(
  successes = round(runif(1e5, 0, 100)),
  trials = round(runif(1e5, 0, 100)) + 100
)

alpha <- 1 - c(0.50, 0.75, 0.90, 0.95)


system.time(
  results <-
    map_dfr(.x = alpha,
            ~ cbind(df, binom_conf(
              x = df$successes,
              n = df$trials,
              alpha = .x
            )))
)


library(furrr)
plan(multisession, workers = 5)
system.time(
  results <-
    future_map_dfr(.x = alpha,
                   ~ cbind(df, binom_conf(
                     x = df$successes,
                     n = df$trials,
                     alpha = .x
                   )))
)


Solution

  • Here is a function (and tests) that implements @StephaneLaurent's suggestion (for multiple x and n, but for a single alpha value only: it shouldn't be too much of an extension to implement multiple alpha, but you can't trivially vectorize over {x, n} and alpha at the same time ...)

    b_est <- function(x, n, alpha = 0.05) {
        est <- x/n
        lwr <- qbeta(alpha/2, x, n-x+1)
        zvals <- !is.na(x) & x == 0
        nvals <- !is.na(x) & x == n
        lwr[zvals] <- 0
        lwr[nvals] <- (alpha/2)^(1/n[nvals])
        upr <- qbeta(1-alpha/2, x+1, n-x)
        upr[zvals] <- 1-(alpha/2)^(1/n[zvals])
        upr[nvals] <- 1
        cbind(est, lwr, upr)
    }
    

    Basic test cases (compare with binom.test):

    cmp <- function(x, n) stopifnot(all.equal(c(binom.test(x, n)$conf.int),
                                              unname(b_est(x, n)[1,-1])))
    cmp(0,5)
    cmp(5,5)
    cmp(3,5)
    
    try(c(binom.test(NA,5)$conf.int))
    b_est(NA,5)
    

    Test for large data:

    set.seed(101)
    n <- rpois(3e7, lambda = 10)
    x <- rbinom(3e7, prob = rbeta(3e7, 1, 1), size = n)
    x[sample(3e7, 10000)] <- NA
    
    system.time(
        b <- b_est(x, n)
    )
    

    This takes 45 seconds on a laptop; it could be parallelized, and could be made marginally more efficient by not computing qbeta in 0/1/NA cases. If you really need to squeeze out more speed it could be written in C++/Rcpp, but that might not result in huge gains.