r dataframe loops regression linear-regression

Loop regression model changing y and controls

I have a dataframe like this (the real one just has many more variables):

data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
               Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))                                                                                                                       

data<-data  %>%
  pivot_longer(
    starts_with("X"),
    names_to = "Year",
    names_transform = list(Year = parse_number)
  ) %>%
  pivot_wider(names_from = Indicator, values_from = value) %>% 
  relocate(Year)

data$y1<-c(1,10,11,3,4,5,2,2,1)
data$y2<-c(1,2,3,4,5,6,6,8,9)
data$y3<-c(10,9,8,7,5,5,11,3,4)
data$y4<-c(1,1,11,3,4,2,2,2,1)
data$y5<-c(5,10,11,3,5,5,5,5,1)

And I want to loop a linear regression model each time with a new y and different combinations of x (GDP) and control variables (in this case, these are Country, Population, Debt and Currency), like so:

lm_y1_GDP=lm(y1~GDP, data = data)
lm_y1_GDP_year=lm(y1~GDP+year, data = data)
lm_y1_GDP_country=lm(y1~GDP+country, data = data)
...
lm_y5_GDP_country_population_debt_currency=lm(y5~GDP+Country+Population+Debt+Currency, data = data)

and store each summary(lm_y1), summary(lm_y2),... in a dedicated list of regression results. Ideally, I would also want to create dummy variables to add to the list of regressions also country- and time-fixed effects. Thanks a lot in advance!

Solution

This one is leaning more towards Tidyverse. I also left Country in, to my understanding lm() does handle factors, though it's something to consider when interpreting results. Broom for tabular overview.

library(dplyr)
library(tidyr)
library(purrr)
library(broom)

data$Country <- as.factor(data$Country)

lm_out <- map(1:5, ~ combn(c("Year", "Country", "Population", "Debt", "Currency"), .x)) %>% 
  # list of matrices, 1x5, 2x10 .. 4x5, 5x1
  map(~ apply(.x, 2, paste0, collapse = "+")) %>% 
  # formulas without y.~GDP+
  unlist() %>% 
  paste0("GDP+", .) %>%
  # all combinations as data.frame
  expand.grid(y = c("y1", "y2", "y3", "y4", "y5"), terms = .) %>% 
  mutate(frm = paste0(y,"~",terms)) %>% 
  # y       terms            frm
  # 1  y1    GDP+Year    y1~GDP+Year
  # 2  y2    GDP+Year    y2~GDP+Year
  # 3  y3    GDP+Year    y3~GDP+Year
  # 4  y4    GDP+Year    y4~GDP+Year
  # 5  y5    GDP+Year    y5~GDP+Year
  # ...
  pull(frm) %>% 
  set_names() %>% 
  # fit 155 models
  map(~ lm(.x, data = data))

# used formulas can be found from list names:
names(lm_out)[1]
#> [1] "y1~GDP+Year"
summary(lm_out[[1]])
#> 
#> Call:
#> lm(formula = .x, data = data)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.3627 -1.6373 -0.2843  2.4118  2.6863 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)  
#> (Intercept) -1.604e+04  4.789e+03  -3.350   0.0154 *
#> GDP         -1.677e+00  5.847e-01  -2.867   0.0285 *
#> Year         7.980e+00  2.382e+00   3.351   0.0154 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.541 on 6 degrees of freedom
#> Multiple R-squared:  0.6541, Adjusted R-squared:  0.5387 
#> F-statistic: 5.672 on 2 and 6 DF,  p-value: 0.0414

# imap to access list names as .y, run tidy and glance over whole list,
# add column with formula
imap_dfr(lm_out, ~ tidy(.x) %>% mutate(formula = .y) )
#> # A tibble: 790 × 6
#>    term          estimate std.error statistic  p.value formula    
#>    <chr>            <dbl>     <dbl>     <dbl>    <dbl> <chr>      
#>  1 (Intercept) -16043.     4789.       -3.35  0.0154   y1~GDP+Year
#>  2 GDP             -1.68      0.585    -2.87  0.0285   y1~GDP+Year
#>  3 Year             7.98      2.38      3.35  0.0154   y1~GDP+Year
#>  4 (Intercept)   8628.     2019.        4.27  0.00524  y2~GDP+Year
#>  5 GDP              1.49      0.246     6.04  0.000933 y2~GDP+Year
#>  6 Year            -4.29      1.00     -4.27  0.00525  y2~GDP+Year
#>  7 (Intercept)   -554.     4645.       -0.119 0.909    y3~GDP+Year
#>  8 GDP             -0.576     0.567    -1.02  0.349    y3~GDP+Year
#>  9 Year             0.280     2.31      0.121 0.907    y3~GDP+Year
#> 10 (Intercept)  -8273.     5882.       -1.41  0.209    y4~GDP+Year
#> # … with 780 more rows

imap_dfr(lm_out, ~ glance(.x) %>% mutate(formula = .y) )
#> # A tibble: 155 × 13
#>    r.squared adj.r.squ…¹ sigma stati…² p.value    df  logLik   AIC   BIC devia…³
#>        <dbl>       <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>
#>  1     0.654      0.539  2.54    5.67  4.14e-2     2 -19.3    46.7  47.5  38.7  
#>  2     0.879      0.839  1.07   21.8   1.77e-3     2 -11.6    31.1  31.9   6.89 
#>  3     0.420      0.227  2.46    2.18  1.95e-1     2 -19.1    46.1  46.9  36.4  
#>  4     0.269      0.0257 3.12    1.11  3.90e-1     2 -21.2    50.4  51.2  58.5  
#>  5     0.548      0.397  2.43    3.64  9.23e-2     2 -18.9    45.9  46.6  35.4  
#>  6     0.578      0.325  3.07    2.29  1.96e-1     3 -20.2    50.5  51.4  47.2  
#>  7     0.989      0.982  0.356 148.    2.65e-5     3  -0.824  11.6  12.6   0.633
#>  8     0.611      0.378  2.21    2.62  1.63e-1     3 -17.3    44.5  45.5  24.4  
#>  9     0.256     -0.191  3.45    0.573 6.57e-1     3 -21.3    52.5  53.5  59.5  
#> 10     0.580      0.328  2.56    2.30  1.95e-1     3 -18.6    47.2  48.2  32.9  
#> # … with 145 more rows, 3 more variables: df.residual <int>, nobs <int>,
#> #   formula <chr>, and abbreviated variable names ¹adj.r.squared, ²statistic,
#> #   ³deviance

Input:

data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
                 Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))                                                                                                                       

data<-data  %>%
  pivot_longer(
    starts_with("X"),
    names_to = "Year",
    names_transform = list(Year = readr::parse_number)
  ) %>%
  pivot_wider(names_from = Indicator, values_from = value) %>% 
  relocate(Year)

data$y1<-c(1,10,11,3,4,5,2,2,1)
data$y2<-c(1,2,3,4,5,6,6,8,9)
data$y3<-c(10,9,8,7,5,5,11,3,4)
data$y4<-c(1,1,11,3,4,2,2,2,1)
data$y5<-c(5,10,11,3,5,5,5,5,1)

^{Created on 2023-01-20 with reprex v2.0.2}