I have a dataframe like this (the real one just has many more variables):
data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))
data<-data %>%
pivot_longer(
starts_with("X"),
names_to = "Year",
names_transform = list(Year = parse_number)
) %>%
pivot_wider(names_from = Indicator, values_from = value) %>%
relocate(Year)
data$y1<-c(1,10,11,3,4,5,2,2,1)
data$y2<-c(1,2,3,4,5,6,6,8,9)
data$y3<-c(10,9,8,7,5,5,11,3,4)
data$y4<-c(1,1,11,3,4,2,2,2,1)
data$y5<-c(5,10,11,3,5,5,5,5,1)
And I want to loop a linear regression model each time with a new y and different combinations of x (GDP) and control variables (in this case, these are Country, Population, Debt and Currency), like so:
lm_y1_GDP=lm(y1~GDP, data = data)
lm_y1_GDP_year=lm(y1~GDP+year, data = data)
lm_y1_GDP_country=lm(y1~GDP+country, data = data)
...
lm_y5_GDP_country_population_debt_currency=lm(y5~GDP+Country+Population+Debt+Currency, data = data)
and store each summary(lm_y1), summary(lm_y2),...
in a dedicated list of regression results. Ideally, I would also want to create dummy variables to add to the list of regressions also country- and time-fixed effects. Thanks a lot in advance!
This one is leaning more towards Tidyverse. I also left Country
in, to my understanding lm()
does handle factors, though it's something to consider when interpreting results. Broom for tabular overview.
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
data$Country <- as.factor(data$Country)
lm_out <- map(1:5, ~ combn(c("Year", "Country", "Population", "Debt", "Currency"), .x)) %>%
# list of matrices, 1x5, 2x10 .. 4x5, 5x1
map(~ apply(.x, 2, paste0, collapse = "+")) %>%
# formulas without y.~GDP+
unlist() %>%
paste0("GDP+", .) %>%
# all combinations as data.frame
expand.grid(y = c("y1", "y2", "y3", "y4", "y5"), terms = .) %>%
mutate(frm = paste0(y,"~",terms)) %>%
# y terms frm
# 1 y1 GDP+Year y1~GDP+Year
# 2 y2 GDP+Year y2~GDP+Year
# 3 y3 GDP+Year y3~GDP+Year
# 4 y4 GDP+Year y4~GDP+Year
# 5 y5 GDP+Year y5~GDP+Year
# ...
pull(frm) %>%
set_names() %>%
# fit 155 models
map(~ lm(.x, data = data))
# used formulas can be found from list names:
names(lm_out)[1]
#> [1] "y1~GDP+Year"
summary(lm_out[[1]])
#>
#> Call:
#> lm(formula = .x, data = data)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.3627 -1.6373 -0.2843 2.4118 2.6863
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.604e+04 4.789e+03 -3.350 0.0154 *
#> GDP -1.677e+00 5.847e-01 -2.867 0.0285 *
#> Year 7.980e+00 2.382e+00 3.351 0.0154 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.541 on 6 degrees of freedom
#> Multiple R-squared: 0.6541, Adjusted R-squared: 0.5387
#> F-statistic: 5.672 on 2 and 6 DF, p-value: 0.0414
# imap to access list names as .y, run tidy and glance over whole list,
# add column with formula
imap_dfr(lm_out, ~ tidy(.x) %>% mutate(formula = .y) )
#> # A tibble: 790 × 6
#> term estimate std.error statistic p.value formula
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) -16043. 4789. -3.35 0.0154 y1~GDP+Year
#> 2 GDP -1.68 0.585 -2.87 0.0285 y1~GDP+Year
#> 3 Year 7.98 2.38 3.35 0.0154 y1~GDP+Year
#> 4 (Intercept) 8628. 2019. 4.27 0.00524 y2~GDP+Year
#> 5 GDP 1.49 0.246 6.04 0.000933 y2~GDP+Year
#> 6 Year -4.29 1.00 -4.27 0.00525 y2~GDP+Year
#> 7 (Intercept) -554. 4645. -0.119 0.909 y3~GDP+Year
#> 8 GDP -0.576 0.567 -1.02 0.349 y3~GDP+Year
#> 9 Year 0.280 2.31 0.121 0.907 y3~GDP+Year
#> 10 (Intercept) -8273. 5882. -1.41 0.209 y4~GDP+Year
#> # … with 780 more rows
imap_dfr(lm_out, ~ glance(.x) %>% mutate(formula = .y) )
#> # A tibble: 155 × 13
#> r.squared adj.r.squ…¹ sigma stati…² p.value df logLik AIC BIC devia…³
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.654 0.539 2.54 5.67 4.14e-2 2 -19.3 46.7 47.5 38.7
#> 2 0.879 0.839 1.07 21.8 1.77e-3 2 -11.6 31.1 31.9 6.89
#> 3 0.420 0.227 2.46 2.18 1.95e-1 2 -19.1 46.1 46.9 36.4
#> 4 0.269 0.0257 3.12 1.11 3.90e-1 2 -21.2 50.4 51.2 58.5
#> 5 0.548 0.397 2.43 3.64 9.23e-2 2 -18.9 45.9 46.6 35.4
#> 6 0.578 0.325 3.07 2.29 1.96e-1 3 -20.2 50.5 51.4 47.2
#> 7 0.989 0.982 0.356 148. 2.65e-5 3 -0.824 11.6 12.6 0.633
#> 8 0.611 0.378 2.21 2.62 1.63e-1 3 -17.3 44.5 45.5 24.4
#> 9 0.256 -0.191 3.45 0.573 6.57e-1 3 -21.3 52.5 53.5 59.5
#> 10 0.580 0.328 2.56 2.30 1.95e-1 3 -18.6 47.2 48.2 32.9
#> # … with 145 more rows, 3 more variables: df.residual <int>, nobs <int>,
#> # formula <chr>, and abbreviated variable names ¹adj.r.squared, ²statistic,
#> # ³deviance
Input:
data<-data.frame(Country=c("USA","USA","USA","USA","India","India","India","India","China","China","China","China"),
Indicator=rep(c("Population","GDP","Debt","Currency"),times=3),`2011`=rep(c(1,2,3,4),each=3),`2012`=rep(c(4,5,6,7),each=3),`2013`=rep(c(8,9,11,12),each=3))
data<-data %>%
pivot_longer(
starts_with("X"),
names_to = "Year",
names_transform = list(Year = readr::parse_number)
) %>%
pivot_wider(names_from = Indicator, values_from = value) %>%
relocate(Year)
data$y1<-c(1,10,11,3,4,5,2,2,1)
data$y2<-c(1,2,3,4,5,6,6,8,9)
data$y3<-c(10,9,8,7,5,5,11,3,4)
data$y4<-c(1,1,11,3,4,2,2,2,1)
data$y5<-c(5,10,11,3,5,5,5,5,1)
Created on 2023-01-20 with reprex v2.0.2