Search code examples
rreshapefrequencytabularsummary

Data reshape and summarization by frequency distribution


I have a dataset obtained by survey:

structure(list(question_1_A...1 = c("A", "A", "B", "C"), question_1_B = c("A", 
"B", "B", "C"), question_1_C = c("A", "A", "A", "B"), question_2_A...4 = c("A", 
"D", "B", "C"), question_2_B = c("D", NA, "B", "C"), question_2_C = c("E", 
"D", "A", "B")), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"))

I would like to get a dataframe according to this figure:

enter image description here

Thanks!


Solution

  • We may use split.default after removing the substring in the column names

    out <-  lapply(split.default(df1, sub(".*_([A-C]).*", "\\1", names(df1))),
          function(x)  {
         x1 <- round(t(proportions(table(stack(x)[2:1]), 1)), 2)
         x1 })
    

    -output

    > out
    $A
          ind
    values question_1_A...1 question_2_A...4
         A             0.50             0.25
         B             0.25             0.25
         C             0.25             0.25
         D             0.00             0.25
    
    $B
          ind
    values question_1_B question_2_B
         A         0.25         0.00
         B         0.50         0.33
         C         0.25         0.33
         D         0.00         0.33
    
    $C
          ind
    values question_1_C question_2_C
         A         0.75         0.25
         B         0.25         0.25
         D         0.00         0.25
         E         0.00         0.25
    

    If we want it in gt, then we can have

    library(dplyr)
    library(tidyr)
    library(stringr)
    library(gt)
    df1 %>%
        rename_with(~ str_remove(., "\\.+\\d+$")) %>%
        pivot_longer(cols = everything(), names_to = c(".value", "grp"), 
          names_pattern = "(.*)_([A-Z])$", values_drop_na = TRUE) %>%
        pivot_longer(cols = starts_with('question'), 
           names_to = 'question', values_drop_na = TRUE) %>%
        count(grp, question, value)  %>% 
        group_by(grp, question) %>% 
        mutate(n =  sprintf('(%.2f)', round(n/sum(n), 2))) %>% 
        ungroup %>%
        unite(value, value, n, sep = " ") %>% 
        mutate(rn = data.table::rowid(grp, question)) %>% 
        pivot_wider(names_from = question, values_from = value) %>% 
        select(-rn) %>%
        group_by(grp) %>%
        gt(.)
    

    -output

    enter image description here