Search code examples
rdplyrpivottidyr

pivot_wider for multiple choice questionnaire


have a dataset from a survey. 2 questions have multiple choice options and it doesn't allow me to use pivot_wider

here is a sample data.

nam=c("per_id","quest","answ")
per_id=c("0012","0023","0045","0003","0003","0045","0003","0012","0023","0004","0004","0001","0005","0546","0001","0546","0005","0004","0004")
quest=c("A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B")
answ=c("Self","Self","Self","Self","Father","Sister","Daughter","Mother","Father","Self","Father","X1","Y1","Z1","Y1","Z2","X1","X1","X1")

df=cbind.data.frame(per_id,quest,answ)
names(df)=nam

what I want to have is

here


Solution

  • You might try the following approach using split + Reduce + merge based on unique rows.

    Approach

    l = lapply(split(df, df$quest), \(x) { x = unique(x); x$quest = NULL; x })
    # names(l$A)[2] = "A"
    # names(l$B)[2] = "B"
    # in case you have more than two quests: 
    l = lapply(names(l), \(x) setNames(l[[x]], c(names(l[[x]])[1], x)))
    Reduce(\(...) merge(..., all = TRUE), l) 
    

    Result

       per_id        A    B
    1    0001     <NA>   X1
    2    0001     <NA>   Y1
    3    0003     Self <NA>
    4    0003   Father <NA>
    5    0003 Daughter <NA>
    6    0004     Self   X1
    7    0004   Father   X1
    8    0005     <NA>   Y1
    9    0005     <NA>   X1
    10   0012     Self <NA>
    11   0012   Mother <NA>
    12   0023     Self <NA>
    13   0023   Father <NA>
    14   0045     Self <NA>
    15   0045   Sister <NA>
    16   0546     <NA>   Z1
    17   0546     <NA>   Z2
    

    Data

    df = structure(list(per_id = c("0012", "0023", "0045", "0003", "0003", 
    "0045", "0003", "0012", "0023", "0004", "0004", "0001", "0005", 
    "0546", "0001", "0546", "0005", "0004", "0004"), quest = c("A", 
    "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", 
    "B", "B", "B", "B", "B"), answ = c("Self", "Self", "Self", "Self", 
    "Father", "Sister", "Daughter", "Mother", "Father", "Self", "Father", 
    "X1", "Y1", "Z1", "Y1", "Z2", "X1", "X1", "X1")), class = "data.frame", row.names = c(NA, 
    -19L))