Search code examples
rloopslinear-regressionlmr-colnames

loop regression when column names have special characters


Dummy data shown below. I tried amending my loop regression code to account for spaces in the column names, but it still gives an error. Additionally, I'm not sure how to adjust .x to account for spaces in the independent variable column names too. The purpose of the code is to regress the first column against each of the subsequent columns one at a time. i.e. Canada Price ~ Global Price, then Canada Price ~ Canada Bond Price and so on.

lm.test <- map_dfr(
  set_names(names(df)),
  ~ tidy(lm(as.formula(paste("`",colnames(df[1]),"`","~", .x)),
            data = df)),
  .id = "predictor") 
Canada Price Global Price Canada 2Y Bond Price US - Canada Inflation Oil Price
-0.030661468 -0.000207995 0.084 -0.075 0.002116
-0.034269304 -0.140212141 0.363 -1.062 -0.06122
0.02371464 0.019987 -0.781 0.273 0.209895
-0.007971484 0.090597341 -0.221 0.068 -0.10657
0.005342716 0.060627149 0.151 0.234 -0.22191
-0.002210376 0.046010712 -0.106 0.13 0.073975
0.018338299 0.019799534 0.036 0.005 0.058091
0.000662471 -0.025356047 0.523 0.045 -0.03765
0.002124411 0.045979342 -0.433 0.282 0.365933
0.034191555 0.043211347 -0.08 0.147 0.113365
-0.011470069 -0.033773176 0.396 -0.037 -0.09753
0.007039847 0.050327089 -0.052 0.151 0.145487
-0.028872959 -0.004501371 0.324 -0.214 0.064282
0.009244373 -0.00092413 0.166 -0.066 0.076961
0.017448201 -0.010638258 -0.179 0.053 0.108548
-0.002379819 0.050111674 0.352 -0.161 -0.11261
-0.002103406 0.03094893 0.121 0.106 0.130575
0.019154969 0.08307375 0.051 0.149 0.041074
-0.00124318 -0.054526954 0.345 0.032 0.079687
-0.001241636 0.003319001 -0.289 0.219 0.100941

Edit

Data in dput format.

df1 <-
  structure(list(
    `Canada Price` = c(-0.030661468, -0.034269304, 0.02371464, -0.007971484, 
                       0.005342716, -0.002210376, 0.018338299, 0.000662471, 0.002124411, 
                       0.034191555, -0.011470069, 0.007039847, -0.028872959, 0.009244373, 
                       0.017448201, -0.002379819, -0.002103406, 0.019154969, -0.00124318, -0.001241636), 
    `Global Price` = c(-0.000207995, -0.140212141, 0.019987, 0.090597341, 
                       0.060627149, 0.046010712, 0.019799534, -0.025356047, 0.045979342,
                       0.043211347, -0.033773176, 0.050327089, -0.004501371, -0.00092413, 
                       -0.010638258, 0.050111674, 0.03094893, 0.08307375, -0.054526954, 0.003319001), 
    `Canada 2Y Bond Price` = c(0.084, 0.363, -0.781, -0.221, 0.151, -0.106, 
                               0.036, 0.523, -0.433, -0.08, 0.396, -0.052, 0.324, 0.166, -0.179, 0.352,
                               0.121, 0.051, 0.345, -0.289), 
    `US - Canada Inflation` = c(-0.075, -1.062, 0.273, 0.068, 
                                0.234, 0.13, 0.005, 0.045, 0.282, 0.147, -0.037, 0.151, -0.214, 
                                -0.066, 0.053, -0.161, 0.106, 0.149, 0.032, 0.219), 
    `Oil Price` = c(0.002116, -0.06122, 0.209895, -0.10657, -0.22191, 0.073975, 
                    0.058091, -0.03765, 0.365933, 0.113365, -0.09753, 0.145487, 0.064282, 
                    0.076961, 0.108548, -0.11261, 0.130575, 0.041074, 0.079687, 0.100941)), 
    class = "data.frame", row.names = c(NA, -20L))

Solution

  • Here is a solution to run regressions on all other columns but the first.
    The formula is created with reformulate.

    suppressPackageStartupMessages({
      library(magrittr)
      library(purrr)
    })
    
    # create the formula with 'value' as regressor,
    # it will be the column name after pivoting to long format
    fmla <- reformulate("value", response = names(df1)[1L] |> as.name())
    # run the regressions
    lm_fit_list <- df1 %>%
      tidyr::pivot_longer(-1L) %>%
      split(.$name) %>%
      map(\(df) lm(fmla, data = df))
      
    # this gives some statistics
    lm_smry_list <- lm_fit_list %>% map(summary)
    lm_smry_list %>% map(coef)
    #> $`Canada 2Y Bond Price`
    #>                 Estimate  Std. Error    t value   Pr(>|t|)
    #> (Intercept)  0.001736073 0.003667346  0.4733866 0.64162781
    #> value       -0.025789455 0.011690588 -2.2060015 0.04061964
    #> 
    #> $`Global Price`
    #>                 Estimate  Std. Error    t value   Pr(>|t|)
    #> (Intercept) -0.001515084 0.003712875 -0.4080622 0.68804100
    #> value        0.164831106 0.070018144  2.3541199 0.03012634
    #> 
    #> $`Oil Price`
    #>                 Estimate  Std. Error    t value  Pr(>|t|)
    #> (Intercept) -0.001424249 0.004112478 -0.3463238 0.7331147
    #> value        0.046411939 0.030404220  1.5264966 0.1442663
    #> 
    #> $`US - Canada Inflation`
    #>                 Estimate  Std. Error    t value    Pr(>|t|)
    #> (Intercept) 0.0001923356 0.003176245 0.06055441 0.952381362
    #> value       0.0393945114 0.011325922 3.47826091 0.002683133
    

    Created on 2024-02-23 with reprex v2.0.2


    Edit

    After running summary on the output list, you can extract the values you want from this second list (of summaries).

    suppressPackageStartupMessages({
      library(magrittr)
      library(purrr)
    })
    
    # create the formula with 'value' as regressor,
    # it will be the column name after pivoting to long format
    fmla <- reformulate("value", response = names(df1)[1L] |> as.name())
    # run the regressions
    lm_fit_list <- df1 %>%
      tidyr::pivot_longer(-1L) %>%
      split(.$name) %>%
      map(\(df) lm(fmla, data = df))
      
    lm_smry_list <- lm_fit_list %>% map(summary)
    
    stats <- lm_smry_list %>%
      map(coef) %>%
      map2(names(lm_smry_list), \(x, y) {
        data.frame(Variable = y) %>% cbind(x)
      }) %>% 
      list_rbind() 
    row.names(stats) <- NULL
    stats
    #>                Variable      Estimate  Std. Error     t value    Pr(>|t|)
    #> 1  Canada 2Y Bond Price  0.0017360725 0.003667346  0.47338661 0.641627809
    #> 2  Canada 2Y Bond Price -0.0257894550 0.011690588 -2.20600149 0.040619640
    #> 3          Global Price -0.0015150839 0.003712875 -0.40806217 0.688041004
    #> 4          Global Price  0.1648311056 0.070018144  2.35411990 0.030126342
    #> 5             Oil Price -0.0014242490 0.004112478 -0.34632381 0.733114717
    #> 6             Oil Price  0.0464119390 0.030404220  1.52649664 0.144266337
    #> 7 US - Canada Inflation  0.0001923356 0.003176245  0.06055441 0.952381362
    #> 8 US - Canada Inflation  0.0393945114 0.011325922  3.47826091 0.002683133
    

    Created on 2024-02-23 with reprex v2.0.2