I would like to test how being female affects the day of hospital discharge. For this I would like to run a series of regression where the dependent variable is =1 if Monday is the discharge day and =0 otherwise. Next, model would be =1 if Tuesday, and =0 otherwise... etc. The days of the week at the moment are stored in a categorical variable called wkday
.
How would I do this quickly using tidymodel
for in a for-loop
for example? Here is what I have so far...
# libraries:
library(tidyr)
library(dplyr)
# create dataset:
id <- seq(1:1000)
wkdays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
wkday <- sample(wkdays,1000, replace=T)
female <- sample(0:1, 1000, replace = T)
dta <- data.frame(id=id, wkday=wkday, female=female)
dta$mon <- ifelse(dta$wkday=="Monday",1,0)
dta$tues <- ifelse(dta$wkday=="Tuesday",1,0)
dta$wed <- ifelse(dta$wkday=="Wednesday",1,0)
dta$thurs <- ifelse(dta$wkday=="Thursday",1,0)
dta$fri <- ifelse(dta$wkday=="Friday",1,0)
dta$sat <- ifelse(dta$wkday=="Saturday",1,0)
dta$sun <- ifelse(dta$wkday=="Sunday",1,0)
# Model:
mon <- glm(mon ~ female, data=dta, family = "binomial")
tues <- glm(tues ~ female, data=dta, family = "binomial")
.
.
.
summary(mon)
summary(tues)
Maybe something like the following answers the question.
First of all, there is no need to create the dummies one by one by hand, model.matrix
is meant for that.
library(tidyr)
library(dplyr)
library(purrr)
library(broom)
# create dataset:
set.seed(2021)
id <- 1:1000
wkdays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
wkday <- sample(wkdays,1000, replace=T)
female <- sample(0:1, 1000, replace = T)
dta <- data.frame(id=id, wkday=wkday, female=female)
tmp <- model.matrix(~0 + wkday, dta)
colnames(tmp) <- sub("wkday", "", colnames(tmp))
Now the models.
cbind(dta, tmp) %>%
select(-wkday) %>%
pivot_longer(
cols = -c(id, female),
names_to = "wkday",
values_to = "dummy"
) %>%
group_by(wkday) %>%
do(tidy(lm(dummy ~ female, data = .)))
## A tibble: 14 x 6
## Groups: wkday [7]
# wkday term estimate std.error statistic p.value
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 Friday (Intercept) 0.128 0.0155 8.26 4.55e-16
# 2 Friday female 0.0226 0.0219 1.03 3.03e- 1
# 3 Monday (Intercept) 0.152 0.0163 9.30 8.52e-20
# 4 Monday female 0.0126 0.0231 0.547 5.84e- 1
# 5 Saturday (Intercept) 0.170 0.0163 10.4 3.78e-24
# 6 Saturday female -0.0234 0.0231 -1.01 3.12e- 1
# 7 Sunday (Intercept) 0.138 0.0156 8.82 4.88e-18
# 8 Sunday female 0.00857 0.0221 0.388 6.98e- 1
# 9 Thursday (Intercept) 0.128 0.0157 8.14 1.13e-15
#10 Thursday female 0.0326 0.0222 1.47 1.43e- 1
#11 Tuesday (Intercept) 0.138 0.0145 9.49 1.63e-20
#12 Tuesday female -0.0355 0.0205 -1.73 8.41e- 2
#13 Wednesday (Intercept) 0.148 0.0155 9.55 9.66e-21
#14 Wednesday female -0.0174 0.0219 -0.797 4.26e- 1
Without the intercepts in the final output (but models with intercept):
cbind(dta, tmp) %>%
select(-wkday) %>%
pivot_longer(
cols = -c(id, female),
names_to = "wkday",
values_to = "dummy"
) %>%
group_by(wkday) %>%
do(tidy(lm(dummy ~ female, data = .))) %>%
filter(term != "(Intercept)")
## A tibble: 7 x 6
## Groups: wkday [7]
# wkday term estimate std.error statistic p.value
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 Friday female 0.0226 0.0219 1.03 0.303
#2 Monday female 0.0126 0.0231 0.547 0.584
#3 Saturday female -0.0234 0.0231 -1.01 0.312
#4 Sunday female 0.00857 0.0221 0.388 0.698
#5 Thursday female 0.0326 0.0222 1.47 0.143
#6 Tuesday female -0.0355 0.0205 -1.73 0.0841
#7 Wednesday female -0.0174 0.0219 -0.797 0.426
This is simpler, there is no need to create tmp
, it can be created in the pipe. To filter out the intercepts is left optional.
dta%>%
bind_cols(
model.matrix(~0 + wkday, dta) %>% as.data.frame
) %>%
select(-wkday) %>%
pivot_longer(
cols = -c(id, female),
names_to = "wkday",
values_to = "dummy"
) %>%
mutate(wkday = sub("^wkday", "", wkday)) %>%
group_by(wkday) %>%
do(tidy(lm(dummy ~ female, data = .)))