bootstrap within groups in R

I have a data frame with response ratios for multiple locations and each location is assigned to a group (region). I want to generate a regression for each group (region) that uses Response Ratio (RR) as the response, location as the unit of replication, and each soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type but I am not sure how to generate this.

#sample data
df <- data.frame(
  group=rep(c('region1','region2'), 100),
  subgroup=rep(c('location1','location2',
                 'location2', 'location1'), 25),
  predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
  RR=rnorm(200)
)

Adding script from @Rui below. I actually have a multiple regression and so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature.

library(boot)

bootfun <- function(data, i) {
  d <- data[i,]
  fit <- lm(RR ~ soil_type + temperature, data = d)
  coef(fit)
}

set.seed(2022)
set.seed(123)
df <- data.frame(
  group=rep(c('region1','region2'), 100),
  subgroup=rep(c('location1','location2',
                 'location2', 'location1'), 25),
  soil_type = rep(c('soil1','soil2','soil3','soil4'), 25),
  temperature = abs(rnorm(100, 2,1.75)),
  RR=rnorm(200),
stringsAsFactors = TRUE
)

R <- 1000
b_list <- by(df, df$group, \(X) {
  boot(X, bootfun, R, strata = X$subgroup)
})

b_list$region1

Solution

Function boot is base package boot has an argument strata. Split by group and apply a boot function with, for instance, by stratifying by location.

library(boot)

bootfun <- function(data, i) {
  d <- data[i,]
  fit <- lm(RR ~ predictor, data = d)
  coef(fit)
}

set.seed(2022)
df <- data.frame(
  group=rep(c('region1','region2'), 100),
  subgroup=rep(c('location1','location2',
                 'location2', 'location1'), 25),
  predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
  RR=rnorm(200),
  stringsAsFactors = TRUE
)

R <- 1000
b_list <- by(df, df$group, \(X) {
  boot(X, bootfun, R, strata = X$subgroup)
})

b_list$region1
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#> 
#> 
#> Bootstrap Statistics :
#>       original       bias    std. error
#> t1* -0.2608885  0.000469295   0.1541482
#> t2*  0.3502007 -0.004239248   0.2083503

b_list$region2
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#> 
#> 
#> Bootstrap Statistics :
#>        original        bias    std. error
#> t1* -0.03727332 -0.0001557172   0.1422502
#> t2*  0.11987005  0.0016393125   0.1952310

lapply(b_list, boot.ci)
#> Warning in sqrt(tv[, 2L]): NaNs produced

#> Warning in sqrt(tv[, 2L]): NaNs produced
#> $region1
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> FUN(boot.out = X[[i]])
#> 
#> Intervals : 
#> Level      Normal              Basic             Studentized     
#> 95%   (-0.5635,  0.0408 )   (-0.5611,  0.0545 )   (-0.8227, -0.0225 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.5762,  0.0393 )   (-0.5733,  0.0446 )  
#> Calculations and Intervals on Original Scale
#> 
#> $region2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> FUN(boot.out = X[[i]])
#> 
#> Intervals : 
#> Level      Normal              Basic             Studentized     
#> 95%   (-0.3159,  0.2417 )   (-0.3260,  0.2460 )   (-0.3493,  0.1757 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.3206,  0.2514 )   (-0.3321,  0.2352 )  
#> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}

Edit

To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, according to the original data partition. The inner loop is on index, meaning, on the matrix t returned by boot, see help("boot"), section Value. The index are the column numbers in any of

b_list$region1$t
b_list$region2$t

each of them with 3 columns.

library(boot)

npars <- ncol(b_list$region1$t)
ci_list <- lapply(b_list, \(region) {
  ci <- lapply(seq.int(npars), \(index) {
    boot.ci(region, index = index, type = c("norm","basic", "perc", "bca"))
  })
  setNames(ci, c("Intercept", "soil", "temperature"))
})

ci_list$region1$Intercept
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc", 
#>     "bca"), index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.2517,  0.6059 )   (-0.2423,  0.6043 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.2410,  0.6056 )   (-0.2414,  0.6048 )  
#> Calculations and Intervals on Original Scale

ci_list$region2$temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc", 
#>     "bca"), index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.2317,  0.0420 )   (-0.2416,  0.0404 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.2278,  0.0542 )   (-0.2265,  0.0570 )  
#> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}

Edit 2

Like I say in a comment below, in the new data the soil type uniquely identifies pairs of region and location, unique(df[1:3]) shows it. And it becomes useless to split by group and stratify within groups.

bootfun2 <- function(data, i) {
  d <- data[i,]
  fit <- lm(RR ~ temperature + soil_type, data = d)
  coef(fit)
}

unique(df[1:3])  # soil type uniquely identifies region/location
#>     group  subgroup soil_type
#> 1 region1 location1     soil1
#> 2 region2 location2     soil2
#> 3 region1 location2     soil3
#> 4 region2 location1     soil4

fit <- lm(RR ~ temperature + soil_type, data = df)
coef(fit)
#>    (Intercept)    temperature soil_typesoil2 soil_typesoil3 soil_typesoil4 
#>     0.25928498    -0.06352205    -0.17739104    -0.05243836    -0.20408527

set.seed(2022)
R <- 1000
b_3 <- boot(df, bootfun2, R)
b_3
#> 
#> ORDINARY NONPARAMETRIC BOOTSTRAP
#> 
#> 
#> Call:
#> boot(data = df, statistic = bootfun2, R = R)
#> 
#> 
#> Bootstrap Statistics :
#>        original       bias    std. error
#> t1*  0.25928498  0.005724634  0.18033509
#> t2* -0.06352205 -0.002910677  0.05161868
#> t3* -0.17739104  0.004932486  0.18665594
#> t4* -0.05243836  0.005796168  0.19602658
#> t5* -0.20408527  0.004914674  0.20355549

btype <- c("norm","basic", "perc", "bca")


ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) {
  boot.ci(b_3, type = btype, index = index)
})
names(ci_list3) <- names(coef(fit))
ci_list3
#> $`(Intercept)`
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = b_3, type = btype, index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.0999,  0.6070 )   (-0.0868,  0.6172 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.0986,  0.6054 )   (-0.0992,  0.6034 )  
#> Calculations and Intervals on Original Scale
#> 
#> $temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = b_3, type = btype, index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.1618,  0.0406 )   (-0.1631,  0.0401 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.1672,  0.0360 )   (-0.1552,  0.0503 )  
#> Calculations and Intervals on Original Scale
#> 
#> $soil_typesoil2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = b_3, type = btype, index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.5482,  0.1835 )   (-0.5541,  0.1955 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.5503,  0.1994 )   (-0.5542,  0.1927 )  
#> Calculations and Intervals on Original Scale
#> 
#> $soil_typesoil3
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = b_3, type = btype, index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.4424,  0.3260 )   (-0.4399,  0.3068 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.4117,  0.3350 )   (-0.4116,  0.3350 )  
#> Calculations and Intervals on Original Scale
#> 
#> $soil_typesoil4
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#> 
#> CALL : 
#> boot.ci(boot.out = b_3, type = btype, index = index)
#> 
#> Intervals : 
#> Level      Normal              Basic         
#> 95%   (-0.6080,  0.1900 )   (-0.6116,  0.2127 )  
#> 
#> Level     Percentile            BCa          
#> 95%   (-0.6208,  0.2035 )   (-0.6284,  0.1801 )  
#> Calculations and Intervals on Original Scale

^{Created on 2022-10-25 with reprex v2.0.2}