Search code examples
rjoindplyrdata-manipulationsummary

How to join summary statistics as new variables for each statistics back to original dataframe


need help to create a function so that this function can produce summary statistics joined back to the original but named after the column (with a prefix (i.e. cyl_event_pct, cyl_distn_events for each variable with values to the corresponding variable in the data frame. The function will require input parameters both the dataframe, and the independent variable. in the example below, the independent variable will be mtcars$am

df <- mtcars %>% 
  select(cyl,gear,vs,am) 

df %>%
  gather(variable,value, -ncol(df)) %>%
  group_by(variable, value) %>%
  summarise (n = n(),
             n_events = sum(am),
             event_pct = round(mean(am),4)
  ) %>%
  mutate(distn_events = round(n_events/sum(n_events),4)) 

Would like to add event_pct and distn_events as new variable to original df.

   # A tibble: 8 x 6
# Groups:   variable [3]
  variable value     n n_events event_pct distn_events
     <chr> <dbl> <int>    <dbl>     <dbl>        <dbl>
1      cyl     4    11        8    0.7273       0.6154
2      cyl     6     7        3    0.4286       0.2308
3      cyl     8    14        2    0.1429       0.1538
4     gear     3    15        0    0.0000       0.0000
5     gear     4    12        8    0.6667       0.6154
6     gear     5     5        5    1.0000       0.3846
7       vs     0    18        6    0.3333       0.4615
8       vs     1    14        7    0.5000       0.5385

hope to see a dataframe after with additional columns added as cyl_event_pct, cyl_distn_events, gear_event_pct, gear_distn_events to the below table with values to the corresponding variable in the data frame(excluding the independent variable of am)

df
                    cyl gear vs am
Mazda RX4             6    4  0  1
Mazda RX4 Wag         6    4  0  1
Datsun 710            4    4  1  1
Hornet 4 Drive        6    3  1  0
Hornet Sportabout     8    3  0  0
Valiant               6    3  1  0
Duster 360            8    3  0  0
Merc 240D             4    4  1  0
Merc 230              4    4  1  0
Merc 280              6    4  1  0
Merc 280C             6    4  1  0

Thank you in advanced you masters of the universe! JT


Solution

  • You could use

    cols <- c("cyl", "gear", "vs")
    df[paste0(cols,"_event_pct")] <- lapply(cols, function(x) df_stat[df_stat$variable==x,"event_pct"][match(df[[x]], df_stat[df_stat$variable==x,"value"])])
    df[paste0(cols,"_distn_events")] <- lapply(cols, function(x) df_stat[df_stat$variable==x,"distn_events"][match(df[[x]], df_stat[df_stat$variable==x,"value"])])
    df
    

    Output is:

                      cyl gear vs am cyl_event_pct gear_event_pct vs_event_pct cyl_distn_events gear_distn_events vs_distn_events
    Mazda RX4           6    4  0  1        0.4286         0.6667       0.3333           0.2308            0.6154          0.4615
    Mazda RX4 Wag       6    4  0  1        0.4286         0.6667       0.3333           0.2308            0.6154          0.4615
    Datsun 710          4    4  1  1        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
    Hornet 4 Drive      6    3  1  0        0.4286         0.0000       0.5000           0.2308            0.0000          0.5385
    Hornet Sportabout   8    3  0  0        0.1429         0.0000       0.3333           0.1538            0.0000          0.4615
    Valiant             6    3  1  0        0.4286         0.0000       0.5000           0.2308            0.0000          0.5385
    Duster 360          8    3  0  0        0.1429         0.0000       0.3333           0.1538            0.0000          0.4615
    Merc 240D           4    4  1  0        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
    Merc 230            4    4  1  0        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
    Merc 280            6    4  1  0        0.4286         0.6667       0.5000           0.2308            0.6154          0.5385
    Merc 280C           6    4  1  0        0.4286         0.6667       0.5000           0.2308            0.6154          0.5385
    


    #sample data
    > dput(df)
    structure(list(cyl = c(6L, 6L, 4L, 6L, 8L, 6L, 8L, 4L, 4L, 6L, 
    6L), gear = c(4L, 4L, 4L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), vs = c(0L, 
    0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L), am = c(1L, 1L, 1L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("cyl", "gear", "vs", 
    "am"), class = "data.frame", row.names = c("Mazda RX4", "Mazda RX4 Wag", 
    "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
    "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C"
    ))
    > dput(df_stat)
    structure(list(variable = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
    3L, 3L), .Label = c("cyl", "gear", "vs"), class = "factor"), 
        value = c(4L, 6L, 8L, 3L, 4L, 5L, 0L, 1L), n = c(11L, 7L, 
        14L, 15L, 12L, 5L, 18L, 14L), n_events = c(8L, 3L, 2L, 0L, 
        8L, 5L, 6L, 7L), event_pct = c(0.7273, 0.4286, 0.1429, 0, 
        0.6667, 1, 0.3333, 0.5), distn_events = c(0.6154, 0.2308, 
        0.1538, 0, 0.6154, 0.3846, 0.4615, 0.5385)), .Names = c("variable", 
    "value", "n", "n_events", "event_pct", "distn_events"), class = "data.frame", row.names = c("1", 
    "2", "3", "4", "5", "6", "7", "8"))