Search code examples
rfor-looptidymodels

Series of regression where the dependent variable is each level of a categorical variable


I would like to test how being female affects the day of hospital discharge. For this I would like to run a series of regression where the dependent variable is =1 if Monday is the discharge day and =0 otherwise. Next, model would be =1 if Tuesday, and =0 otherwise... etc. The days of the week at the moment are stored in a categorical variable called wkday.

How would I do this quickly using tidymodel for in a for-loop for example? Here is what I have so far...

# libraries:
library(tidyr)
library(dplyr)

# create dataset:
id <- seq(1:1000)
wkdays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
wkday <- sample(wkdays,1000, replace=T)
female <- sample(0:1, 1000, replace = T)

dta <- data.frame(id=id, wkday=wkday, female=female)

dta$mon <- ifelse(dta$wkday=="Monday",1,0)
dta$tues <- ifelse(dta$wkday=="Tuesday",1,0)
dta$wed <- ifelse(dta$wkday=="Wednesday",1,0)
dta$thurs <- ifelse(dta$wkday=="Thursday",1,0)
dta$fri <- ifelse(dta$wkday=="Friday",1,0)
dta$sat <- ifelse(dta$wkday=="Saturday",1,0)
dta$sun <- ifelse(dta$wkday=="Sunday",1,0)



# Model:
mon <- glm(mon ~ female, data=dta, family = "binomial")
tues <- glm(tues ~ female, data=dta, family = "binomial")

.
.
.

summary(mon)
summary(tues)

Solution

  • Maybe something like the following answers the question.
    First of all, there is no need to create the dummies one by one by hand, model.matrix is meant for that.

    library(tidyr)
    library(dplyr)
    library(purrr)
    library(broom)
    
    # create dataset:
    set.seed(2021)
    id <- 1:1000
    wkdays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    wkday <- sample(wkdays,1000, replace=T)
    female <- sample(0:1, 1000, replace = T)
    
    dta <- data.frame(id=id, wkday=wkday, female=female)
    
    tmp <- model.matrix(~0 + wkday, dta)
    colnames(tmp) <- sub("wkday", "", colnames(tmp))
    

    Now the models.

    cbind(dta, tmp) %>% 
      select(-wkday) %>%
      pivot_longer(
        cols = -c(id, female),
        names_to = "wkday",
        values_to = "dummy"
      ) %>% 
      group_by(wkday) %>%
      do(tidy(lm(dummy ~ female, data = .)))
    ## A tibble: 14 x 6
    ## Groups:   wkday [7]
    #   wkday     term        estimate std.error statistic  p.value
    #   <chr>     <chr>          <dbl>     <dbl>     <dbl>    <dbl>
    # 1 Friday    (Intercept)  0.128      0.0155     8.26  4.55e-16
    # 2 Friday    female       0.0226     0.0219     1.03  3.03e- 1
    # 3 Monday    (Intercept)  0.152      0.0163     9.30  8.52e-20
    # 4 Monday    female       0.0126     0.0231     0.547 5.84e- 1
    # 5 Saturday  (Intercept)  0.170      0.0163    10.4   3.78e-24
    # 6 Saturday  female      -0.0234     0.0231    -1.01  3.12e- 1
    # 7 Sunday    (Intercept)  0.138      0.0156     8.82  4.88e-18
    # 8 Sunday    female       0.00857    0.0221     0.388 6.98e- 1
    # 9 Thursday  (Intercept)  0.128      0.0157     8.14  1.13e-15
    #10 Thursday  female       0.0326     0.0222     1.47  1.43e- 1
    #11 Tuesday   (Intercept)  0.138      0.0145     9.49  1.63e-20
    #12 Tuesday   female      -0.0355     0.0205    -1.73  8.41e- 2
    #13 Wednesday (Intercept)  0.148      0.0155     9.55  9.66e-21
    #14 Wednesday female      -0.0174     0.0219    -0.797 4.26e- 1
    

    Without the intercepts in the final output (but models with intercept):

    cbind(dta, tmp) %>% 
      select(-wkday) %>%
      pivot_longer(
        cols = -c(id, female),
        names_to = "wkday",
        values_to = "dummy"
      ) %>% 
      group_by(wkday) %>%
      do(tidy(lm(dummy ~ female, data = .))) %>%
      filter(term != "(Intercept)")
    ## A tibble: 7 x 6
    ## Groups:   wkday [7]
    #  wkday     term   estimate std.error statistic p.value
    #  <chr>     <chr>     <dbl>     <dbl>     <dbl>   <dbl>
    #1 Friday    female  0.0226     0.0219     1.03   0.303 
    #2 Monday    female  0.0126     0.0231     0.547  0.584 
    #3 Saturday  female -0.0234     0.0231    -1.01   0.312 
    #4 Sunday    female  0.00857    0.0221     0.388  0.698 
    #5 Thursday  female  0.0326     0.0222     1.47   0.143 
    #6 Tuesday   female -0.0355     0.0205    -1.73   0.0841
    #7 Wednesday female -0.0174     0.0219    -0.797  0.426 
    

    This is simpler, there is no need to create tmp, it can be created in the pipe. To filter out the intercepts is left optional.

    dta%>%
      bind_cols(
        model.matrix(~0 + wkday, dta) %>% as.data.frame
      ) %>%
      select(-wkday) %>% 
      pivot_longer(
        cols = -c(id, female),
        names_to = "wkday",
        values_to = "dummy"
      ) %>% 
      mutate(wkday = sub("^wkday", "", wkday)) %>% 
      group_by(wkday) %>%
      do(tidy(lm(dummy ~ female, data = .)))