Search code examples
rloopsdatatablesmultiple-instancesconfidence-interval

Counter loops for summary data Confidence Intervals in R


I am trying to use a for-loop as a repeat counter to add summary data to a test sample. I have tried to use a data.frame, matrix, and a vector push my data out of the for loop and populate a table. The best I have got is filling one complete column in a vector and completing all columns but one row in a data frame.

#try empty vector to populate 
large.sample.df <- vector(mode = "double", length = 1000)

#try matrix to populate 
large.matrix <- matrix(nrow = 1000, ncol = 3)
matrix.names <- c("mean", "lwr", "upr")
colnames(large.matrix) <- matrix.names

#Try dataframe to populate
large.df <- data.frame(mean="", lwr="", upr="")

#set total length
n <- length(large.sample.df)

#use functions to calculate confidence interval
lwr.ci <- function(a) (mean(a) - 1.96 * (sd(a)/sqrt(length(a))))
upp.ci <- function(a) (mean(a) + 1.96 * (sd(a)/sqrt(length(a))))

#Start new seed count
set.seed(1234)

#begin for loop for mean, lwr, upr CI
for (i in 1:n) {
  large.sample <- rgamma(n = 1000, shape = 4, rate = 2)
  large.df$mean[i] <- mean(large.sample)
  large.df$lwr[i] <- lwr.ci(large.sample)
  large.df$upr[i] <- upp.ci(large.sample)
  }


Solution

  • Here are two ways to get what you want. First we should distinguish between the sample size and the number of samples:

    set.seed(1234)
    n <- 1000
    samples <- 10  # Keep this small for testing and then increase it
    s <- 4
    r <- 2
    

    First your loop approach:

    results <- data.frame(mean=NA, lwr=NA, upr=NA)   # Not "" which makes the variables character strings
    set.seed(1234)
    for (i in 1:samples) {
        x <- rgamma(n, shape = s, rate = r)
        mn <- mean(x)
        sder <- sd(x)/sqrt(n)
        lwr <- mn - 1.96 * sder
        upr <- mn + 1.96 * sder
        results[i, ] <- c(mn, lwr, upr) 
    }
    results
    #           mean         lwr         upr
    # 1  2.015193688 1.952431714 2.077955663
    # 2  2.024218250 1.962404608 2.086031891
    # 3  2.008401293 1.948363928 2.068438658
    # 4  1.993061142 1.932020588 2.054101696
    # 5  1.975824831 1.912961486 2.038688176
    # 6  1.983761126 1.923583927 2.043938325
    # 7  1.983166350 1.924890819 2.041441880
    # 8  1.975453269 1.915336118 2.035570420
    # 9  1.976118333 1.915025748 2.037210918
    # 10 2.044088839 1.983435628 2.104742050
    

    Now using replicate

    confint <- function(n, s, r) {
        x <- rgamma(n, shape = s, rate = r)
        mn <- mean(x)
        sder <- sd(x)/sqrt(n)
        lwr <- mn - 1.96 * sder
        upr <- mn + 1.96 * sder
        return(c(mean=mn, lwr=lwr, upr=upr))
    }
    confint(n, s, r)   # Test the function
    #        mean         lwr         upr 
    # 1.974328366 1.914003710 2.034653023 
    set.seed(1234)
    results <- replicate(samples, confint(n, s, r))
    results <- t(results)
    results
    #              mean         lwr         upr
    #  [1,] 2.015193688 1.952431714 2.077955663
    #  [2,] 2.024218250 1.962404608 2.086031891
    #  [3,] 2.008401293 1.948363928 2.068438658
    #  [4,] 1.993061142 1.932020588 2.054101696
    #  [5,] 1.975824831 1.912961486 2.038688176
    #  [6,] 1.983761126 1.923583927 2.043938325
    #  [7,] 1.983166350 1.924890819 2.041441880
    #  [8,] 1.975453269 1.915336118 2.035570420
    #  [9,] 1.976118333 1.915025748 2.037210918
    # [10,] 2.044088839 1.983435628 2.104742050
    

    Both approaches agree.