Search code examples
rdplyrresampling

Resampling a number of rows 1000 times and calculate de variance


I want to select 3 random rows per group for one especific variable. I want to run 1000 times. Then I want to calculate the mean, the variance and the mean's confidence intervals of these 1000 resamplings (inside each group).

In the example below: I have different groups in the last column. I want to run the resampling with the 3 random rows and calculate mean, IC and variance (for the variable1 column) inside each group.

In the end I want to have a object with the mean, the ICs and the variance for each of the groups, for variable1.

x1 <- matrix(rnorm(200,mean=3), nrow= 100, ncol=2)
x2 <- c(replicate(5, "AA"),replicate(15, "BB"),replicate(15, "CC"),
        replicate(10, "DD"),replicate(10, "EE"),replicate(10, "FF"),
        replicate(10, "GG"),replicate(5, "HH"),replicate(5, "II"),
        replicate(15, "JJ"))
df <- data.frame(cbind(x1,x2))
colnames(df) <- c("variable1", "variable2","group")

I tried to create a function to run this, but I was not successful.

resem <- function (x){
  for (i in 1:1000) {
    # Randomly select three rows
    sampled_rows <- x[sample(nrow(x), size = 3, replace = T), ]
    
    # Calculate the variance of the selected rows
    variance <- var(sampled_rows) 
    data.frame(variance)
  }
}
    
# Runing the function

df %>%
  group_by(group) %>%
  do(resem(.$variable1))

Any idea, please?

Editing. I thought of something like this to calculate the confidence intervals:

norm.interval <- function(df$variable1,variance = var(df$variable1), conf.level = 0.95) {
  z = qnorm((1 - conf.level)/2, lower.tail = FALSE)
  xbar = mean(df$variable1)
  sdx = sqrt(variance/length(df$variable1))
  c(xbar - z * sdx, xbar + z * sdx)
}

Solution

  • Something like this? The function below computes the means and variances of 3 rows only, not the CI's, 1000 per group.

    suppressPackageStartupMessages(
      library(dplyr)
    )
    
    set.seed(2023)
    x1 <- matrix(rnorm(200,mean=3), nrow= 100, ncol=2)
    x2 <- c(replicate(5, "AA"),replicate(15, "BB"),replicate(15, "CC"),
            replicate(10, "DD"),replicate(10, "EE"),replicate(10, "FF"),
            replicate(10, "GG"),replicate(5, "HH"),replicate(5, "II"),
            replicate(15, "JJ"))
    df <- cbind.data.frame(x1, x2)
    colnames(df) <- c("variable1", "variable2","group")
    
    resem <- function (x, R = 1000L){
      out <- vector("list", R)
      for (i in seq.int(R)) {
        # Randomly select three rows
        sampled_rows <- x[sample(NROW(x), size = 3, replace = TRUE)]
        
        # Calculate the variance of the selected rows
        variance <- var(sampled_rows)
        out[[i]] <- data.frame(mean = mean(sampled_rows), variance)
      }
      do.call(rbind, out)
    }
    
    # Running the function
    
    df %>%
      group_by(group) %>%
      do(resem(.$variable1))
    #> # A tibble: 10,000 × 3
    #> # Groups:   group [10]
    #>    group  mean variance
    #>    <chr> <dbl>    <dbl>
    #>  1 AA     1.84  0.410  
    #>  2 AA     1.69  0.951  
    #>  3 AA     1.99  0.714  
    #>  4 AA     2.66  0.0667 
    #>  5 AA     2.58  0.242  
    #>  6 AA     2.88  0.00349
    #>  7 AA     2.32  0.269  
    #>  8 AA     2.55  0.212  
    #>  9 AA     1.95  0.514  
    #> 10 AA     2.40  0.160  
    #> # ℹ 9,990 more rows
    

    Created on 2023-07-19 with reprex v2.0.2


    Edit

    Another function, returning one row per group, is the following.

    resem <- function (x, R = 1000L){
      out <- vector("list", R)
      for (i in seq.int(R)) {
        # Randomly select three rows
        sampled_rows <- x[sample(NROW(x), size = 3, replace = TRUE)]
        
        # Calculate the variance of the selected rows
        variance <- var(sampled_rows)
        out[[i]] <- cbind(mean = mean(sampled_rows), variance)
      }
      out <- do.call(rbind, out)
      colMeans(out) %>% t() %>% as.data.frame()
    }
    
    # Runing the function
    
    df %>%
      group_by(group) %>%
      do(resem(.$variable1))
    #> # A tibble: 10 × 3
    #> # Groups:   group [10]
    #>    group  mean variance
    #>    <chr> <dbl>    <dbl>
    #>  1 AA     2.23    0.428
    #>  2 BB     3.29    0.381
    #>  3 CC     3.05    1.35 
    #>  4 DD     3.34    0.219
    #>  5 EE     3.73    0.873
    #>  6 FF     2.96    1.71 
    #>  7 GG     2.95    0.701
    #>  8 HH     2.65    0.172
    #>  9 II     3.49    0.425
    #> 10 JJ     2.68    1.29
    

    Created on 2023-07-19 with reprex v2.0.2