Search code examples
rlinear-regressionp-value

Extract lists of coefficients and p-values for multiple invariant independent variables in R


I try to do 1104 linear regressions with the same model. My independent variables do not change. However, my dependent variable does. Indeed, I have 1104 dependent variables. I do not know how to extract all the coefficients (intercepts included) and p-values in order to compute means of each (coefficients & p-values). How to do that with an easy way ? This is my model :

testMCFG1 <- lapply(101:1204, function(i) lm(recexp[,i]~recexp[,"rm"] + recexp[,"zdy"] + recexp[,"ztbl"] + recexp[,"ztms"] + recexp[,"zdfy"] + recexp[,"rm_zdy"] + recexp[,"rm_ztbl"] + recexp[,"rm_ztms"] + recexp[,"rm_zdfy"] + recexp[,"contexte"] + recexp[,"rm_contexte"]))

However, someone here has already showed me how to do that with only one invariant independent variable. That works. Find below the codes for this case:

y <- 'rm'


x <- names(recexp[101:1204])

models <- map(setNames(x, x),
              ~ lm(as.formula(paste(.x, y, sep="~")),
                   data=recexp))

pvalues <-
  data.frame(rsquared = unlist(map(models, ~ summary(.)$r.squared)),
             RSE = unlist(map(models, ~ summary(.)$sigma))) %>%
  rownames_to_column(var = "which_dependent")

results <- full_join(basic_information, pvalues)

results %>% group_by(term) %>% summarise(mean_estimate = mean(estimate))

results %>% group_by(term) %>% summarise(mean_p = mean(p.value))

Solution

  • Here is a solution using several tidyverse packages. You don't provide your data so I'll use mtcars as an example. Put your independent variables into a fixed string called independents and we'll grab your dependents using a slice as you did with your code producing a character vector

    #####
    independents <- 'mpg + vs + am + gear'
    dependent <- names(mtcars[2:7])
    

    Load the libraries

    library(dplyr)
    library(purrr)
    library(broom)
    library(tidyr)
    library(tibble)
    

    Make a list of all the models using purrr::map

    models <- map(setNames(dependent, dependent),
                  ~ lm(as.formula(paste(.x, independents, sep="~")),
                       data=mtcars))
    

    Take that list of lm models and feed it to broom::tidy to extract the basic information about beta estimates, and p values etc. To keep it neat use the name of the list item (which is the dependent variable) and add it as a column. Remove the parens from intercept and add a zero so it is always first and you know it's beta0

    basics <-
       map(models, ~ broom::tidy(.)) %>%
       map2_df(.,
               names(.),
               ~ mutate(.x, which_dependent = .y)) %>%
       select(which_dependent, everything()) %>%
       mutate(term = gsub("\\(Intercept\\)", "0Intercept", term))
    

    Feed the list in again this time extract r squared and sigma a.k.a. "Residual standard error"

    model_summary <-
       data.frame(rsquared = unlist(map(models, ~ summary(.)$r.squared)),
                  RSE = unlist(map(models, ~ summary(.)$sigma))) %>%
       rownames_to_column(var = "which_dependent")
    

    Join the two based on which dependent variable

    results <- full_join(basics, model_summary)
    #> Joining, by = "which_dependent"
    results
    #> # A tibble: 30 x 8
    #>    which_dependent term    estimate std.error statistic  p.value rsquared    RSE
    #>    <chr>           <chr>      <dbl>     <dbl>     <dbl>    <dbl>    <dbl>  <dbl>
    #>  1 cyl             0Inter…   10.4      1.14       9.13  9.58e-10    0.861  0.714
    #>  2 cyl             mpg       -0.117    0.0382    -3.06  4.98e- 3    0.861  0.714
    #>  3 cyl             vs        -1.80     0.374     -4.81  5.09e- 5    0.861  0.714
    #>  4 cyl             am        -0.414    0.502     -0.826 4.16e- 1    0.861  0.714
    #>  5 cyl             gear      -0.258    0.290     -0.891 3.81e- 1    0.861  0.714
    #>  6 disp            0Inter…  571.      94.1        6.07  1.76e- 6    0.804 58.8  
    #>  7 disp            mpg       -9.50     3.14      -3.02  5.47e- 3    0.804 58.8  
    #>  8 disp            vs       -85.9     30.8       -2.79  9.49e- 3    0.804 58.8  
    #>  9 disp            am       -31.9     41.3       -0.774 4.45e- 1    0.804 58.8  
    #> 10 disp            gear     -26.8     23.9       -1.12  2.71e- 1    0.804 58.8  
    #> # … with 20 more rows
    

    It's in long format so you can do things like summarise grouped by term

    results %>%
       group_by(term) %>%
       summarise(mean_p = mean(p.value)) %>%
       arrange(term)
    #> `summarise()` ungrouping output (override with `.groups` argument)
    #> # A tibble: 5 x 2
    #>   term         mean_p
    #>   <chr>         <dbl>
    #> 1 0Intercept 0.000168
    #> 2 am         0.359   
    #> 3 gear       0.287   
    #> 4 mpg        0.0538  
    #> 5 vs         0.159
    

    Or you can make it wider if you prefer...

    wide_results <-
       results %>%
       pivot_wider(names_from = term,
                   values_from = estimate:p.value)
    wide_results
    #> # A tibble: 6 x 23
    #>   which_dependent rsquared    RSE estimate_0Inter… estimate_mpg estimate_vs
    #>   <chr>              <dbl>  <dbl>            <dbl>        <dbl>       <dbl>
    #> 1 cyl                0.861  0.714            10.4       -0.117       -1.80 
    #> 2 disp               0.804 58.8             571.        -9.50       -85.9  
    #> 3 hp                 0.736 37.7             241.        -8.17       -41.4  
    #> 4 drat               0.667  0.331             2.07       0.0228       0.166
    #> 5 wt                 0.804  0.464             5.90      -0.104       -0.146
    #> 6 qsec               0.734  0.988            17.5        0.0894       2.29 
    #> # … with 17 more variables: estimate_am <dbl>, estimate_gear <dbl>,
    #> #   std.error_0Intercept <dbl>, std.error_mpg <dbl>, std.error_vs <dbl>,
    #> #   std.error_am <dbl>, std.error_gear <dbl>, statistic_0Intercept <dbl>,
    #> #   statistic_mpg <dbl>, statistic_vs <dbl>, statistic_am <dbl>,
    #> #   statistic_gear <dbl>, p.value_0Intercept <dbl>, p.value_mpg <dbl>,
    #> #   p.value_vs <dbl>, p.value_am <dbl>, p.value_gear <dbl>
    names(wide_results)
    #>  [1] "which_dependent"      "rsquared"             "RSE"                 
    #>  [4] "estimate_0Intercept"  "estimate_mpg"         "estimate_vs"         
    #>  [7] "estimate_am"          "estimate_gear"        "std.error_0Intercept"
    #> [10] "std.error_mpg"        "std.error_vs"         "std.error_am"        
    #> [13] "std.error_gear"       "statistic_0Intercept" "statistic_mpg"       
    #> [16] "statistic_vs"         "statistic_am"         "statistic_gear"      
    #> [19] "p.value_0Intercept"   "p.value_mpg"          "p.value_vs"          
    #> [22] "p.value_am"           "p.value_gear"