need help to create a function so that this function can produce summary statistics joined back to the original but named after the column (with a prefix (i.e. cyl_event_pct, cyl_distn_events for each variable with values to the corresponding variable in the data frame. The function will require input parameters both the dataframe, and the independent variable. in the example below, the independent variable will be mtcars$am
df <- mtcars %>%
select(cyl,gear,vs,am)
df %>%
gather(variable,value, -ncol(df)) %>%
group_by(variable, value) %>%
summarise (n = n(),
n_events = sum(am),
event_pct = round(mean(am),4)
) %>%
mutate(distn_events = round(n_events/sum(n_events),4))
Would like to add event_pct and distn_events as new variable to original df.
# A tibble: 8 x 6
# Groups: variable [3]
variable value n n_events event_pct distn_events
<chr> <dbl> <int> <dbl> <dbl> <dbl>
1 cyl 4 11 8 0.7273 0.6154
2 cyl 6 7 3 0.4286 0.2308
3 cyl 8 14 2 0.1429 0.1538
4 gear 3 15 0 0.0000 0.0000
5 gear 4 12 8 0.6667 0.6154
6 gear 5 5 5 1.0000 0.3846
7 vs 0 18 6 0.3333 0.4615
8 vs 1 14 7 0.5000 0.5385
hope to see a dataframe after with additional columns added as cyl_event_pct, cyl_distn_events, gear_event_pct, gear_distn_events to the below table with values to the corresponding variable in the data frame(excluding the independent variable of am)
df
cyl gear vs am
Mazda RX4 6 4 0 1
Mazda RX4 Wag 6 4 0 1
Datsun 710 4 4 1 1
Hornet 4 Drive 6 3 1 0
Hornet Sportabout 8 3 0 0
Valiant 6 3 1 0
Duster 360 8 3 0 0
Merc 240D 4 4 1 0
Merc 230 4 4 1 0
Merc 280 6 4 1 0
Merc 280C 6 4 1 0
Thank you in advanced you masters of the universe! JT
You could use
cols <- c("cyl", "gear", "vs")
df[paste0(cols,"_event_pct")] <- lapply(cols, function(x) df_stat[df_stat$variable==x,"event_pct"][match(df[[x]], df_stat[df_stat$variable==x,"value"])])
df[paste0(cols,"_distn_events")] <- lapply(cols, function(x) df_stat[df_stat$variable==x,"distn_events"][match(df[[x]], df_stat[df_stat$variable==x,"value"])])
df
Output is:
cyl gear vs am cyl_event_pct gear_event_pct vs_event_pct cyl_distn_events gear_distn_events vs_distn_events
Mazda RX4 6 4 0 1 0.4286 0.6667 0.3333 0.2308 0.6154 0.4615
Mazda RX4 Wag 6 4 0 1 0.4286 0.6667 0.3333 0.2308 0.6154 0.4615
Datsun 710 4 4 1 1 0.7273 0.6667 0.5000 0.6154 0.6154 0.5385
Hornet 4 Drive 6 3 1 0 0.4286 0.0000 0.5000 0.2308 0.0000 0.5385
Hornet Sportabout 8 3 0 0 0.1429 0.0000 0.3333 0.1538 0.0000 0.4615
Valiant 6 3 1 0 0.4286 0.0000 0.5000 0.2308 0.0000 0.5385
Duster 360 8 3 0 0 0.1429 0.0000 0.3333 0.1538 0.0000 0.4615
Merc 240D 4 4 1 0 0.7273 0.6667 0.5000 0.6154 0.6154 0.5385
Merc 230 4 4 1 0 0.7273 0.6667 0.5000 0.6154 0.6154 0.5385
Merc 280 6 4 1 0 0.4286 0.6667 0.5000 0.2308 0.6154 0.5385
Merc 280C 6 4 1 0 0.4286 0.6667 0.5000 0.2308 0.6154 0.5385
#sample data
> dput(df)
structure(list(cyl = c(6L, 6L, 4L, 6L, 8L, 6L, 8L, 4L, 4L, 6L,
6L), gear = c(4L, 4L, 4L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), vs = c(0L,
0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L), am = c(1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("cyl", "gear", "vs",
"am"), class = "data.frame", row.names = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C"
))
> dput(df_stat)
structure(list(variable = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L), .Label = c("cyl", "gear", "vs"), class = "factor"),
value = c(4L, 6L, 8L, 3L, 4L, 5L, 0L, 1L), n = c(11L, 7L,
14L, 15L, 12L, 5L, 18L, 14L), n_events = c(8L, 3L, 2L, 0L,
8L, 5L, 6L, 7L), event_pct = c(0.7273, 0.4286, 0.1429, 0,
0.6667, 1, 0.3333, 0.5), distn_events = c(0.6154, 0.2308,
0.1538, 0, 0.6154, 0.3846, 0.4615, 0.5385)), .Names = c("variable",
"value", "n", "n_events", "event_pct", "distn_events"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))