I have a data frame with response ratios for multiple locations and each location is assigned to a group (region). I want to generate a regression for each group (region) that uses Response Ratio (RR) as the response, location as the unit of replication, and each soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type but I am not sure how to generate this.
#sample data
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
RR=rnorm(200)
)
Adding script from @Rui below. I actually have a multiple regression and so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature.
library(boot)
bootfun <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ soil_type + temperature, data = d)
coef(fit)
}
set.seed(2022)
set.seed(123)
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
soil_type = rep(c('soil1','soil2','soil3','soil4'), 25),
temperature = abs(rnorm(100, 2,1.75)),
RR=rnorm(200),
stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1
Function boot
is base package boot
has an argument strata
. Split by group
and apply a boot function with, for instance, by
stratifying by location.
library(boot)
bootfun <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ predictor, data = d)
coef(fit)
}
set.seed(2022)
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
RR=rnorm(200),
stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.2608885 0.000469295 0.1541482
#> t2* 0.3502007 -0.004239248 0.2083503
b_list$region2
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.03727332 -0.0001557172 0.1422502
#> t2* 0.11987005 0.0016393125 0.1952310
lapply(b_list, boot.ci)
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> $region1
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.5635, 0.0408 ) (-0.5611, 0.0545 ) (-0.8227, -0.0225 )
#>
#> Level Percentile BCa
#> 95% (-0.5762, 0.0393 ) (-0.5733, 0.0446 )
#> Calculations and Intervals on Original Scale
#>
#> $region2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.3159, 0.2417 ) (-0.3260, 0.2460 ) (-0.3493, 0.1757 )
#>
#> Level Percentile BCa
#> 95% (-0.3206, 0.2514 ) (-0.3321, 0.2352 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, according to the original data partition. The inner loop is on index
, meaning, on the matrix t
returned by boot
, see help("boot")
, section Value. The index
are the column numbers in any of
b_list$region1$t
b_list$region2$t
each of them with 3 columns.
library(boot)
npars <- ncol(b_list$region1$t)
ci_list <- lapply(b_list, \(region) {
ci <- lapply(seq.int(npars), \(index) {
boot.ci(region, index = index, type = c("norm","basic", "perc", "bca"))
})
setNames(ci, c("Intercept", "soil", "temperature"))
})
ci_list$region1$Intercept
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2517, 0.6059 ) (-0.2423, 0.6043 )
#>
#> Level Percentile BCa
#> 95% (-0.2410, 0.6056 ) (-0.2414, 0.6048 )
#> Calculations and Intervals on Original Scale
ci_list$region2$temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2317, 0.0420 ) (-0.2416, 0.0404 )
#>
#> Level Percentile BCa
#> 95% (-0.2278, 0.0542 ) (-0.2265, 0.0570 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
Like I say in a comment below, in the new data the soil type uniquely identifies pairs of region and location, unique(df[1:3])
shows it. And it becomes useless to split by group
and stratify within groups.
bootfun2 <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ temperature + soil_type, data = d)
coef(fit)
}
unique(df[1:3]) # soil type uniquely identifies region/location
#> group subgroup soil_type
#> 1 region1 location1 soil1
#> 2 region2 location2 soil2
#> 3 region1 location2 soil3
#> 4 region2 location1 soil4
fit <- lm(RR ~ temperature + soil_type, data = df)
coef(fit)
#> (Intercept) temperature soil_typesoil2 soil_typesoil3 soil_typesoil4
#> 0.25928498 -0.06352205 -0.17739104 -0.05243836 -0.20408527
set.seed(2022)
R <- 1000
b_3 <- boot(df, bootfun2, R)
b_3
#>
#> ORDINARY NONPARAMETRIC BOOTSTRAP
#>
#>
#> Call:
#> boot(data = df, statistic = bootfun2, R = R)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 0.25928498 0.005724634 0.18033509
#> t2* -0.06352205 -0.002910677 0.05161868
#> t3* -0.17739104 0.004932486 0.18665594
#> t4* -0.05243836 0.005796168 0.19602658
#> t5* -0.20408527 0.004914674 0.20355549
btype <- c("norm","basic", "perc", "bca")
ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) {
boot.ci(b_3, type = btype, index = index)
})
names(ci_list3) <- names(coef(fit))
ci_list3
#> $`(Intercept)`
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.0999, 0.6070 ) (-0.0868, 0.6172 )
#>
#> Level Percentile BCa
#> 95% (-0.0986, 0.6054 ) (-0.0992, 0.6034 )
#> Calculations and Intervals on Original Scale
#>
#> $temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.1618, 0.0406 ) (-0.1631, 0.0401 )
#>
#> Level Percentile BCa
#> 95% (-0.1672, 0.0360 ) (-0.1552, 0.0503 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.5482, 0.1835 ) (-0.5541, 0.1955 )
#>
#> Level Percentile BCa
#> 95% (-0.5503, 0.1994 ) (-0.5542, 0.1927 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil3
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.4424, 0.3260 ) (-0.4399, 0.3068 )
#>
#> Level Percentile BCa
#> 95% (-0.4117, 0.3350 ) (-0.4116, 0.3350 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil4
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.6080, 0.1900 ) (-0.6116, 0.2127 )
#>
#> Level Percentile BCa
#> 95% (-0.6208, 0.2035 ) (-0.6284, 0.1801 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2