Search code examples
rdataframeloopsregressionlinear-regression

Loop regression model changing y and controls


I have a dataframe like this (the real one just has many more variables):

data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
               Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))                                                                                                                       

data<-data  %>%
  pivot_longer(
    starts_with("X"),
    names_to = "Year",
    names_transform = list(Year = parse_number)
  ) %>%
  pivot_wider(names_from = Indicator, values_from = value) %>% 
  relocate(Year)

data$y1<-c(1,10,11,3,4,5,2,2,1)
data$y2<-c(1,2,3,4,5,6,6,8,9)
data$y3<-c(10,9,8,7,5,5,11,3,4)
data$y4<-c(1,1,11,3,4,2,2,2,1)
data$y5<-c(5,10,11,3,5,5,5,5,1)

enter image description here

And I want to loop a linear regression model each time with a new y and different combinations of x (GDP) and control variables (in this case, these are Country, Population, Debt and Currency), like so:

lm_y1_GDP=lm(y1~GDP, data = data)
lm_y1_GDP_year=lm(y1~GDP+year, data = data)
lm_y1_GDP_country=lm(y1~GDP+country, data = data)
...
lm_y5_GDP_country_population_debt_currency=lm(y5~GDP+Country+Population+Debt+Currency, data = data)

and store each summary(lm_y1), summary(lm_y2),... in a dedicated list of regression results. Ideally, I would also want to create dummy variables to add to the list of regressions also country- and time-fixed effects. Thanks a lot in advance!


Solution

  • This one is leaning more towards Tidyverse. I also left Country in, to my understanding lm() does handle factors, though it's something to consider when interpreting results. Broom for tabular overview.

    library(dplyr)
    library(tidyr)
    library(purrr)
    library(broom)
    
    data$Country <- as.factor(data$Country)
    
    lm_out <- map(1:5, ~ combn(c("Year", "Country", "Population", "Debt", "Currency"), .x)) %>% 
      # list of matrices, 1x5, 2x10 .. 4x5, 5x1
      map(~ apply(.x, 2, paste0, collapse = "+")) %>% 
      # formulas without y.~GDP+
      unlist() %>% 
      paste0("GDP+", .) %>%
      # all combinations as data.frame
      expand.grid(y = c("y1", "y2", "y3", "y4", "y5"), terms = .) %>% 
      mutate(frm = paste0(y,"~",terms)) %>% 
      # y       terms            frm
      # 1  y1    GDP+Year    y1~GDP+Year
      # 2  y2    GDP+Year    y2~GDP+Year
      # 3  y3    GDP+Year    y3~GDP+Year
      # 4  y4    GDP+Year    y4~GDP+Year
      # 5  y5    GDP+Year    y5~GDP+Year
      # ...
      pull(frm) %>% 
      set_names() %>% 
      # fit 155 models
      map(~ lm(.x, data = data))
    
    # used formulas can be found from list names:
    names(lm_out)[1]
    #> [1] "y1~GDP+Year"
    summary(lm_out[[1]])
    #> 
    #> Call:
    #> lm(formula = .x, data = data)
    #> 
    #> Residuals:
    #>     Min      1Q  Median      3Q     Max 
    #> -3.3627 -1.6373 -0.2843  2.4118  2.6863 
    #> 
    #> Coefficients:
    #>               Estimate Std. Error t value Pr(>|t|)  
    #> (Intercept) -1.604e+04  4.789e+03  -3.350   0.0154 *
    #> GDP         -1.677e+00  5.847e-01  -2.867   0.0285 *
    #> Year         7.980e+00  2.382e+00   3.351   0.0154 *
    #> ---
    #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    #> 
    #> Residual standard error: 2.541 on 6 degrees of freedom
    #> Multiple R-squared:  0.6541, Adjusted R-squared:  0.5387 
    #> F-statistic: 5.672 on 2 and 6 DF,  p-value: 0.0414
    
    # imap to access list names as .y, run tidy and glance over whole list,
    # add column with formula
    imap_dfr(lm_out, ~ tidy(.x) %>% mutate(formula = .y) )
    #> # A tibble: 790 × 6
    #>    term          estimate std.error statistic  p.value formula    
    #>    <chr>            <dbl>     <dbl>     <dbl>    <dbl> <chr>      
    #>  1 (Intercept) -16043.     4789.       -3.35  0.0154   y1~GDP+Year
    #>  2 GDP             -1.68      0.585    -2.87  0.0285   y1~GDP+Year
    #>  3 Year             7.98      2.38      3.35  0.0154   y1~GDP+Year
    #>  4 (Intercept)   8628.     2019.        4.27  0.00524  y2~GDP+Year
    #>  5 GDP              1.49      0.246     6.04  0.000933 y2~GDP+Year
    #>  6 Year            -4.29      1.00     -4.27  0.00525  y2~GDP+Year
    #>  7 (Intercept)   -554.     4645.       -0.119 0.909    y3~GDP+Year
    #>  8 GDP             -0.576     0.567    -1.02  0.349    y3~GDP+Year
    #>  9 Year             0.280     2.31      0.121 0.907    y3~GDP+Year
    #> 10 (Intercept)  -8273.     5882.       -1.41  0.209    y4~GDP+Year
    #> # … with 780 more rows
    
    imap_dfr(lm_out, ~ glance(.x) %>% mutate(formula = .y) )
    #> # A tibble: 155 × 13
    #>    r.squared adj.r.squ…¹ sigma stati…² p.value    df  logLik   AIC   BIC devia…³
    #>        <dbl>       <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>
    #>  1     0.654      0.539  2.54    5.67  4.14e-2     2 -19.3    46.7  47.5  38.7  
    #>  2     0.879      0.839  1.07   21.8   1.77e-3     2 -11.6    31.1  31.9   6.89 
    #>  3     0.420      0.227  2.46    2.18  1.95e-1     2 -19.1    46.1  46.9  36.4  
    #>  4     0.269      0.0257 3.12    1.11  3.90e-1     2 -21.2    50.4  51.2  58.5  
    #>  5     0.548      0.397  2.43    3.64  9.23e-2     2 -18.9    45.9  46.6  35.4  
    #>  6     0.578      0.325  3.07    2.29  1.96e-1     3 -20.2    50.5  51.4  47.2  
    #>  7     0.989      0.982  0.356 148.    2.65e-5     3  -0.824  11.6  12.6   0.633
    #>  8     0.611      0.378  2.21    2.62  1.63e-1     3 -17.3    44.5  45.5  24.4  
    #>  9     0.256     -0.191  3.45    0.573 6.57e-1     3 -21.3    52.5  53.5  59.5  
    #> 10     0.580      0.328  2.56    2.30  1.95e-1     3 -18.6    47.2  48.2  32.9  
    #> # … with 145 more rows, 3 more variables: df.residual <int>, nobs <int>,
    #> #   formula <chr>, and abbreviated variable names ¹​adj.r.squared, ²​statistic,
    #> #   ³​deviance
    

    Input:

    data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
                     Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))                                                                                                                       
    
    data<-data  %>%
      pivot_longer(
        starts_with("X"),
        names_to = "Year",
        names_transform = list(Year = readr::parse_number)
      ) %>%
      pivot_wider(names_from = Indicator, values_from = value) %>% 
      relocate(Year)
    
    data$y1<-c(1,10,11,3,4,5,2,2,1)
    data$y2<-c(1,2,3,4,5,6,6,8,9)
    data$y3<-c(10,9,8,7,5,5,11,3,4)
    data$y4<-c(1,1,11,3,4,2,2,2,1)
    data$y5<-c(5,10,11,3,5,5,5,5,1)
    

    Created on 2023-01-20 with reprex v2.0.2