Search code examples
rregressionquantreg

Accessing the quantile regression estimates values from the rq function


I am performing a quantile regression as following:

library(quantreg) #quantile regression framework

#function to perform quantile regression with bootstrap confidence intervals
rq_bootstrap <- function(data, n_bootstrap = 1000) {
  rq_model <- summary(
    rq(julian_day ~ year_centered, data = data, tau = 1:99 / 100),
    se = "boot",
    bsmethod = "xy",
    R = n_bootstrap,
    level = 0.95
  )
  plot(rq_model, mfrow = c(1, 2))
  coef_df <- as.data.frame(sapply(rq_model, function(x) c(tau = x$tau, x$coefficients[-1, ])))
  return(list(model_summary = rq_model, coefficients = coef_df))
}

#apply the function
rq_results_lat1 <- rq_bootstrap(lat1)

The resulting plot is correct. However, I would like to access to the estimates values and the values of their bootstrap confidence interval, but I do not manage to.

Here is a sample of my data frame for reproductibility:

> dput(lat1[sample(nrow(lat1), 50),])
structure(list(date = structure(c(5567, 14294, 12820, 16167, 
17629, 14327, 16452, 16130, 18664, 13913, 13243, 15759, 12507, 
11049, 15410, 11342, 12821, 16147, 13240, 18645, 17979, 11706, 
17607, 17615, 16528, 16131, 13562, 18349, 13221, 14290, 15068, 
16507, 15059, 14997, 14283, 9931, 3677, 15421, 12151, 13591, 
13920, 16548, 16123, 18294, 13596, 17186, 12862, 6690, 13218, 
13252), class = "Date"), lat = c(57, 56, 57, 59, 58, 59, 56, 
59, 56, 56, 59, 57, 56, 57, 59, 55, 56, 56, 59, 59, 59, 56, 57, 
57, 58, 56, 58, 55, 57, 56, 58, 59, 58, 58, 56, 58, 58, 58, 56, 
57, 57, 55, 59, 56, 56, 55, 57, 59, 59, 59), long = c(11, 12, 
12, 17, 11, 15, 12, 18, 12, 16, 14, 11, 16, 11, 15, 12, 16, 16, 
18, 16, 17, 12, 13, 12, 13, 12, 12, 13, 11, 16, 14, 17, 12, 14, 
16, 14, 11, 17, 16, 18, 12, 14, 16, 12, 12, 12, 13, 16, 17, 15
), julian_day = c(89, 50, 37, 97, 98, 83, 17, 60, 37, 35, 95, 
54, 90, 93, 71, 20, 38, 77, 92, 18, 83, 19, 76, 84, 93, 61, 49, 
88, 73, 46, 94, 72, 85, 23, 39, 70, 26, 82, 99, 78, 42, 113, 
53, 33, 83, 20, 79, 117, 70, 104), year = c("1985", "2009", "2005", 
"2014", "2018", "2009", "2015", "2014", "2021", "2008", "2006", 
"2013", "2004", "2000", "2012", "2001", "2005", "2014", "2006", 
"2021", "2019", "2002", "2018", "2018", "2015", "2014", "2007", 
"2020", "2006", "2009", "2011", "2015", "2011", "2011", "2009", 
"1997", "1980", "2012", "2003", "2007", "2008", "2015", "2014", 
"2020", "2007", "2017", "2005", "1988", "2006", "2006"), decade = c("1980-1989", 
"2000-2009", "2000-2009", "2010-2019", "2010-2019", "2000-2009", 
"2010-2019", "2010-2019", "2020-2029", "2000-2009", "2000-2009", 
"2010-2019", "2000-2009", "2000-2009", "2010-2019", "2000-2009", 
"2000-2009", "2010-2019", "2000-2009", "2020-2029", "2010-2019", 
"2000-2009", "2010-2019", "2010-2019", "2010-2019", "2010-2019", 
"2000-2009", "2020-2029", "2000-2009", "2000-2009", "2010-2019", 
"2010-2019", "2010-2019", "2010-2019", "2000-2009", "1990-1999", 
"1980-1989", "2010-2019", "2000-2009", "2000-2009", "2000-2009", 
"2010-2019", "2010-2019", "2020-2029", "2000-2009", "2010-2019", 
"2000-2009", "1980-1989", "2000-2009", "2000-2009"), time = c(13L, 
15L, 15L, 16L, 16L, 15L, 16L, 16L, 17L, 15L, 15L, 16L, 15L, 15L, 
16L, 15L, 15L, 16L, 15L, 17L, 16L, 15L, 16L, 16L, 16L, 16L, 15L, 
17L, 15L, 15L, 16L, 16L, 16L, 16L, 15L, 14L, 13L, 16L, 15L, 15L, 
15L, 16L, 16L, 17L, 15L, 16L, 15L, 13L, 15L, 15L), lat_grouped = c(1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1), year_centered = structure(c(85, 109, 105, 
114, 118, 109, 115, 114, 121, 108, 106, 113, 104, 100, 112, 101, 
105, 114, 106, 121, 119, 102, 118, 118, 115, 114, 107, 120, 106, 
109, 111, 115, 111, 111, 109, 97, 80, 112, 103, 107, 108, 115, 
114, 120, 107, 117, 105, 88, 106, 106), class = "AsIs")), row.names = c(3378L, 
21344L, 11753L, 33172L, 42002L, 22010L, 33453L, 31852L, 49389L, 
17982L, 15167L, 29657L, 11059L, 7655L, 27224L, 7730L, 11784L, 
32491L, 14955L, 48899L, 44025L, 8367L, 40663L, 41022L, 35232L, 
31932L, 16540L, 47318L, 13830L, 21255L, 26321L, 34488L, 25776L, 
25064L, 21134L, 6545L, 2085L, 27476L, 9846L, 17134L, 18082L, 
35808L, 31695L, 45590L, 17310L, 37493L, 12220L, 4371L, 13731L, 
15875L), class = "data.frame")

> head(lat1)
          date lat long julian_day year    decade time lat_grouped year_centered
592 1970-01-18  57   12         18 1970 1970-1979   12           1            70
593 1970-01-18  57   18         18 1970 1970-1979   12           1            70
594 1970-01-18  56   12         18 1970 1970-1979   12           1            70
595 1970-01-18  57   18         18 1970 1970-1979   12           1            70
596 1970-01-19  56   16         19 1970 1970-1979   12           1            70
597 1970-01-23  56   12         23 1970 1970-1979   12           1            70

I would be glad if someone knows how to extract the researched values.


Solution

  • Your data are already there in the output, but you probably need to reshape it into a format with estimates and upper / lower confidence intervals. The confidence intervals can be calculated, since they are simply the fit +/- 1.96 standard errors.

    The manipulation is probably a bit easier using tidyverse functions:

    library(quantreg) 
    library(tidyverse)
    
    rq_bootstrap <- function(data, n_bootstrap = 1000) {
      suppressWarnings(
        rq(julian_day ~ year_centered, data = data, tau = 1:99 / 100)
        ) %>%
        summary(se = "boot", bsmethod = "xy", R = n_bootstrap, level = 0.95) %>%
        map_df(function(x) {
          coef(x) |>
            as.data.frame() |>
            cbind(variable = rownames(coef(x)), tau = x$tau)
        }) %>%
        bind_rows() %>%
        mutate(lower = Value - `Std. Error` * 1.96, 
               upper = Value + `Std. Error` * 1.96) %>%
        select(variable, tau, Value, lower, upper) %>%
        as_tibble()
    }
    

    Now if we run our function, we get a nicer result:

    rq_results_lat1 <- rq_bootstrap(lat1)
    
    rq_results_lat1
    #> # A tibble: 198 x 5
    #>    variable        tau   Value   lower   upper
    #>    <chr>         <dbl>   <dbl>   <dbl>   <dbl>
    #>  1 (Intercept)    0.01 34.7    -76.2   146.   
    #>  2 year_centered  0.01 -0.154   -1.14    0.836
    #>  3 (Intercept)    0.02 34.7    -54.6   124.   
    #>  4 year_centered  0.02 -0.154   -0.949   0.641
    #>  5 (Intercept)    0.03 34.7    -81.1   150.   
    #>  6 year_centered  0.03 -0.154   -1.19    0.885
    #>  7 (Intercept)    0.04 24.4    -92.0   141.   
    #>  8 year_centered  0.04 -0.0526  -1.09    0.985
    #>  9 (Intercept)    0.05 30.1    -94.5   155.   
    #> 10 year_centered  0.05 -0.100   -1.22    1.02 
    #> # ... with 188 more rows
    

    And to show this is the expected output, let's plot this using ggplot

    ggplot(rq_results_lat1, aes(tau, Value)) +
      geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3) +
      ggh4x::geom_pointpath(size = 1) +
      facet_wrap(.~variable, scale = 'free_y') +
      theme_classic(base_size = 16) +
      theme(strip.background = element_blank(),
            panel.border = element_rect(fill = NA),
            strip.text = element_text(size = 14, face = "bold"),
            panel.spacing = unit(1, "cm"))
    

    enter image description here

    This is identical to the base R plot obtained with plot(rq_model), though when you plot the summary you need to specify the confidence level. We can check the output of the above code is correct by doing:

    plot(rq_model, mfrow = c(1, 2), level = 0.95)
    

    enter image description here

    Created on 2023-08-03 with reprex v2.0.2