I have a dataset df which includes two variables: successes (x) and trials (n). I wish to obtain, for every row in that dataset, the stats::binom.test() results (est, lower and upper CI) for different levels of confidence. It should be able to handle n = NA or 0.
I wrote the code below, but I was hoping for a faster implementation since my true dataset is very large (30 million rows). Any suggestions that will improve the speed will be appreciated (vectorisation, parallelisation, ...?), preferably a "tidy" approach, but that is not required.
library(tidyverse)
# function: uses lapply to loop over the conf.levels for the same x & n pair
binom_test <- function(x, n, conf.levels = c(0.50, 0.75, 0.90, 0.95)) {
  if (is.na(n) | n < 1) {
    return(data.frame(
      conf_level = conf.levels,
      est = rep(NA, length(conf.levels)),
      x = rep(NA, length(conf.levels)),
      n = rep(NA, length(conf.levels)),
      lower_ci = rep(NA, length(conf.levels)),
      upper_ci = rep(NA, length(conf.levels))
    ))
  } else {
    results <- lapply(conf.levels, function(conf.level) {
      test <- binom.test(x = x, n = n, conf.level = conf.level)
      c(
        conf_level = conf.level,
        x = x,
        n = n,
        est = test$estimate[[1]],
        lower_ci = test$conf.int[1],
        upper_ci = test$conf.int[2]
      )
    })
    results_df <- as.data.frame(do.call(rbind, results))
    return(results_df)
  }
}
# data
df <- tibble(
  successes = round(runif(1000, 0, 100)),
  trials = round(runif(1000, 0, 100)) + 100
)
# requires rowwise (slower)
df %>%
  rowwise() %>%
  mutate(beta = list(binom_test(x = successes, n = trials))) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#> successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 78 180 0.5 78 180 0.433 0.406
#> 2 78 180 0.75 78 180 0.433 0.389
#> 3 78 180 0.9 78 180 0.433 0.371
#> 4 78 180 0.95 78 180 0.433 0.360
#> 5 87 140 0.5 87 140 0.621 0.590
#> 6 87 140 0.75 87 140 0.621 0.570
#> 7 87 140 0.9 87 140 0.621 0.549
#> 8 87 140 0.95 87 140 0.621 0.536
#> 9 89 129 0.5 89 129 0.690 0.658
#> 10 89 129 0.75 89 129 0.690 0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci <dbl>
# avoids rowwise but still slow
df %>%
  mutate(beta = pmap(list(successes, trials), binom_test)) %>%
  unnest(beta, names_sep = "_")
#> # A tibble: 4,000 × 8
#> successes trials beta_conf_level beta_x beta_n beta_est beta_lower_ci
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 78 180 0.5 78 180 0.433 0.406
#> 2 78 180 0.75 78 180 0.433 0.389
#> 3 78 180 0.9 78 180 0.433 0.371
#> 4 78 180 0.95 78 180 0.433 0.360
#> 5 87 140 0.5 87 140 0.621 0.590
#> 6 87 140 0.75 87 140 0.621 0.570
#> 7 87 140 0.9 87 140 0.621 0.549
#> 8 87 140 0.95 87 140 0.621 0.536
#> 9 89 129 0.5 89 129 0.690 0.658
#> 10 89 129 0.75 89 129 0.690 0.637
#> # ℹ 3,990 more rows
#> # ℹ 1 more variable: beta_upper_ci <dbl>
# parallel?
# library(furrr)
# plan(multisession, workers = 5)
# df %>%
#   mutate(beta = furrr::future_pmap(list(successes, trials), binom_test)) %>%
#   unnest(beta, names_sep = "_")
Created on 2023-11-24 with reprex v2.0.2
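One other idea that might pay off at this scale (a sketch, not benchmarked: it assumes successes and trials are bounded integers, as in the simulated data above, so there are far fewer distinct (x, n) pairs than rows): run binom_test once per distinct pair and join the results back. The distinct_results name is mine.
# Sketch: memoise over distinct (successes, trials) pairs, then join back.
# Assumes the values are bounded integers, so distinct() shrinks the problem.
distinct_results <- df %>%
  distinct(successes, trials) %>%
  mutate(beta = map2(successes, trials, ~ binom_test(x = .x, n = .y))) %>%
  unnest(beta, names_sep = "_")

df %>% left_join(distinct_results, by = c("successes", "trials"))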
EDIT: I found that the Hmisc package has a vectorised function (Hmisc::binconf()) to compute the CI. But it cannot handle NAs as the function above does... (a possible workaround is sketched after the timing code below).
library(tidyverse)
library(Hmisc)
binom_conf <- function(x, n, alpha) {
  out <- Hmisc::binconf(
    x = x,
    n = n,
    alpha = alpha,
    method = "exact",
    return.df = TRUE
  )
  out$conf_level <- 1 - alpha
  rownames(out) <- NULL
  names(out) <- c("est", "lower_ci", "upper_ci", "conf_level")
  return(out)
}
df <- tibble(
  successes = round(runif(1e5, 0, 100)),
  trials = round(runif(1e5, 0, 100)) + 100
)

alpha <- 1 - c(0.50, 0.75, 0.90, 0.95)

system.time(
  results <-
    map_dfr(.x = alpha,
            ~ cbind(df, binom_conf(
              x = df$successes,
              n = df$trials,
              alpha = .x
            )))
)

library(furrr)
plan(multisession, workers = 5)

system.time(
  results <-
    future_map_dfr(.x = alpha,
                   ~ cbind(df, binom_conf(
                     x = df$successes,
                     n = df$trials,
                     alpha = .x
                   )))
)
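A possible workaround for the NA limitation (a sketch of my own, not part of Hmisc; the binom_conf_na name is hypothetical): run binconf only on the complete cases and stitch NA rows back in afterwards.
# Sketch of an NA-safe wrapper around binom_conf (hypothetical helper):
# rows with missing or zero trials get NA estimates instead of an error.
binom_conf_na <- function(x, n, alpha) {
  out <- data.frame(
    est = rep(NA_real_, length(x)),
    lower_ci = NA_real_,
    upper_ci = NA_real_,
    conf_level = 1 - alpha
  )
  ok <- !is.na(x) & !is.na(n) & n >= 1
  if (any(ok)) {
    out[ok, ] <- binom_conf(x = x[ok], n = n[ok], alpha = alpha)
  }
  out
}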
Here is a function (and tests) that implements @StephaneLaurent's suggestion, for multiple x and n but for a single alpha value only. It shouldn't be too much of an extension to implement multiple alpha (one way is sketched after the function below), but you can't trivially vectorize over {x, n} and alpha at the same time ...
# vectorised Clopper-Pearson ("exact") interval via the beta quantile function
b_est <- function(x, n, alpha = 0.05) {
  est <- x/n
  lwr <- qbeta(alpha/2, x, n - x + 1)
  ## qbeta() misbehaves at the boundaries, so patch x == 0 and x == n directly
  zvals <- !is.na(x) & x == 0
  nvals <- !is.na(x) & x == n
  lwr[zvals] <- 0
  lwr[nvals] <- (alpha/2)^(1/n[nvals])
  upr <- qbeta(1 - alpha/2, x + 1, n - x)
  upr[zvals] <- 1 - (alpha/2)^(1/n[zvals])
  upr[nvals] <- 1
  cbind(est, lwr, upr)
}
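For completeness, here is one way the multiple-alpha extension could look (my sketch, with a hypothetical b_est_multi name and default alphas): loop over the handful of alpha values and stack the results, so the expensive part stays vectorised over x and n.
# Sketch: vectorise over x and n, loop over the (few) alpha values.
b_est_multi <- function(x, n, alphas = c(0.50, 0.25, 0.10, 0.05)) {
  do.call(rbind, lapply(alphas, function(a) {
    cbind(alpha = a, b_est(x, n, alpha = a))
  }))
}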
Basic test cases (compare with binom.test):
cmp <- function(x, n) stopifnot(all.equal(c(binom.test(x, n)$conf.int),
                                          unname(b_est(x, n)[1, -1])))
cmp(0, 5)
cmp(5, 5)
cmp(3, 5)
try(c(binom.test(NA, 5)$conf.int))  # binom.test() errors on NA input
b_est(NA, 5)                        # b_est() returns NA limits instead
Test for large data:
set.seed(101)
n <- rpois(3e7, lambda = 10)
x <- rbinom(3e7, prob = rbeta(3e7, 1, 1), size = n)
x[sample(3e7, 10000)] <- NA
system.time(
  b <- b_est(x, n)
)
This takes 45 seconds on a laptop; it could be parallelized, and could be made marginally more efficient by not computing qbeta in the 0/1/NA cases (one way to do that is sketched below). If you really need to squeeze out more speed it could be written in C++/Rcpp, but that might not result in huge gains.
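A rough sketch of that optimisation (my variant, with a hypothetical b_est2 name, not benchmarked): call qbeta only on the interior indices and fill the boundary and NA cases directly.
# Sketch: only call qbeta where 0 < x < n; boundaries and NAs are filled directly.
b_est2 <- function(x, n, alpha = 0.05) {
  est <- x/n
  lwr <- upr <- rep(NA_real_, length(x))
  zvals <- !is.na(x) & x == 0
  nvals <- !is.na(x) & x == n
  mid <- !is.na(x) & !is.na(n) & x > 0 & x < n  # interior cases needing qbeta
  lwr[mid] <- qbeta(alpha/2, x[mid], n[mid] - x[mid] + 1)
  upr[mid] <- qbeta(1 - alpha/2, x[mid] + 1, n[mid] - x[mid])
  lwr[zvals] <- 0
  upr[zvals] <- 1 - (alpha/2)^(1/n[zvals])
  lwr[nvals] <- (alpha/2)^(1/n[nvals])
  upr[nvals] <- 1
  cbind(est, lwr, upr)
}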