Search code examples
rstan

Logistic regression in R, Stan


I want to extract the predicted values (in the generated quantities block) of the Stan fit and compare them with the real observations but I can't find an easy solution. here is how did it with a simple logistic regression model:

library(rstan)
library(tidyverse)
library(boot)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

T <- 40
set.seed(123)
x <- sort(runif(T, 0, 10))
alpha <- 1
beta <- 0.2
logit_p <- alpha + beta * x
p <- inv.logit(logit_p)
y <- rbinom(T, 1, p)

model_code <- "
data {
  int<lower=0> N;
  vector[N] x;
  
  int<lower=0,upper=1> y[N];
  
}
parameters {
  real alpha;
  real beta;
}
model {
  y ~ bernoulli_logit(alpha + beta * x);
}
generated quantities {
  vector[N] z;
  for (n in 1:N)
    z[n] = bernoulli_logit_rng(alpha + beta * x[n]);
    
}"

model_data <- list(
  N = T, 
  x = x,
  y = y
)


stan_run <- stan(
  data = model_data,
  model_code = model_code
)


posterior <- rstan::extract(stan_run)

df <- as.data.frame(posterior$z)
df <-df %>% summarise(across(everything(.), 
                             ~ ifelse(length(.[which(. == 1)]) > length(.[which(. == 0)]), 1, 0)))

I don't know if my approach is correct. Does anyone know any straightforward way of doing it?


Solution

  • The apply function can be your friend here:

    Run example

    library(rstan)
    #> Loading required package: StanHeaders
    #> Loading required package: ggplot2
    #> rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
    #> For execution on a local, multicore CPU with excess RAM we recommend calling
    #> options(mc.cores = parallel::detectCores()).
    #> To avoid recompilation of unchanged Stan programs, we recommend calling
    #> rstan_options(auto_write = TRUE)
    library(tidyverse)
    library(boot)
    
    rstan_options(auto_write = TRUE)
    options(mc.cores = parallel::detectCores())
    
    T <- 40
    set.seed(123)
    x <- sort(runif(T, 0, 10))
    alpha <- 1
    beta <- 0.2
    logit_p <- alpha + beta * x
    p <- inv.logit(logit_p)
    y <- rbinom(T, 1, p)
    
    model_code <- "
    data {
      int<lower=0> N;
      vector[N] x;
      
      int<lower=0,upper=1> y[N];
      
    }
    parameters {
      real alpha;
      real beta;
    }
    model {
      y ~ bernoulli_logit(alpha + beta * x);
    }
    generated quantities {
      vector[N] z;
      for (n in 1:N)
        z[n] = bernoulli_logit_rng(alpha + beta * x[n]);
        
    }"
    
    model_data <- list(
      N = T, 
      x = x,
      y = y
    )
    
    # run model
    stan_run <- stan(
      data = model_data,
      model_code = model_code
    )
    #> Trying to compile a simple C file
    #> Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
    #> clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.1/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.1/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
    #> In file included from <built-in>:1:
    #> In file included from /Library/Frameworks/R.framework/Versions/4.1/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
    #> In file included from /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/Dense:1:
    #> In file included from /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/Core:88:
    #> /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name 'namespace'
    #> namespace Eigen {
    #> ^
    #> /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:16: error: expected ';' after top level declarator
    #> namespace Eigen {
    #>                ^
    #>                ;
    #> In file included from <built-in>:1:
    #> In file included from /Library/Frameworks/R.framework/Versions/4.1/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
    #> In file included from /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/Dense:1:
    #> /Library/Frameworks/R.framework/Versions/4.1/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
    #> #include <complex>
    #>          ^~~~~~~~~
    #> 3 errors generated.
    #> make: *** [foo.o] Error 1
    
    

    Use apply

    In the case of logistic regression, you can just apply median to get the modal (most frequent) value.

    (df.pred <- apply(rstan::extract(stan_run)$z, 2, median)
    
    #>  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    #> [39] 1 1
    

    Created on 2021-09-19 by the reprex package (v2.0.1)