Search code examples
rdata.tabletransposemeltdcast

Long to wide with different number of rows for each group/ID


I have the following data frame:

df <- data.frame(stringsAsFactors=FALSE,
              ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
                     2017080808, 2017080808),
       team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
                     "Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
     player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
                     "Jason Woodward", "Jack Wallace", "Will Hurrell",
                     "Tusi Pisi"),
             DGC = c(7, 5, 0, 3, 0, 8, 9),
              MR = c(3, 8, 31, 25, 0, 85, 0),
              CB = c(2, 9, 1, 13, 0, 0, 0)
  )

That I would like to transform to wide version:

new_df <- data.frame(stringsAsFactors=FALSE,
                ID = c(2016070707, 2017080808),
         team_name = c("Harlequins", "Bristol Rugby"),
          player_1 = c("Karl Dickson", "Jason Woodward"),
          player_2 = c("Tim Swiel", "Jack Wallace"),
          player_3 = c("Alofa Alofa", "Will Hurrell"),
          player_4 = c(NA, "Tusi Pisi"),
             DGC_1 = c(7, 3),
             DGC_2 = c(5, 0),
             DGC_3 = c(0, 8),
             DGC_4 = c(NA, 9),
              MR_1 = c(3, 25),
              MR_2 = c(8, 0),
              MR_3 = c(31, 85),
              MR_4 = c(NA, 0),
              CB_1 = c(2, 13),
              CB_2 = c(9, 0),
              CB_3 = c(1, 0),
              CB_4 = c(NA, 0)
      )

However, as you may see the first team has 3 players and second team has 4 players. How can I transpose df to new_df and also create these extra columns like player_4, DGC_4 in case there is different number of players in the team. And the difference will be compensated by "NAs" (just like in new_df).


Solution

  • Here's one solution using package data.table

    df <- data.frame(stringsAsFactors=FALSE,
                     ID = c(2016070707, 2016070707, 2016070707, 2017080808, 2017080808,
                            2017080808, 2017080808),
                     team_name = c("Harlequins", "Harlequins", "Harlequins", "Bristol Rugby",
                                   "Bristol Rugby", "Bristol Rugby", "Bristol Rugby"),
                     player_name = c("Karl Dickson", "Tim Swiel", "Alofa Alofa",
                                     "Jason Woodward", "Jack Wallace", "Will Hurrell",
                                     "Tusi Pisi"),
                     DGC = c(7, 5, 0, 3, 0, 8, 9),
                     MR = c(3, 8, 31, 25, 0, 85, 0),
                     CB = c(2, 9, 1, 13, 0, 0, 0)
    )
    
    library(data.table)
    #convert to data.table
    setDT(df)
    #create a sequence from 1 to N for each ID
    df[, idx := 1:.N, by = ID]
    #cast wide
    dcast(ID + team_name ~ idx, data = df, value.var = c("player_name", "DGC", "MR", "CB"), )
    #>            ID     team_name  player_name_1 player_name_2 player_name_3
    #> 1: 2016070707    Harlequins   Karl Dickson     Tim Swiel   Alofa Alofa
    #> 2: 2017080808 Bristol Rugby Jason Woodward  Jack Wallace  Will Hurrell
    #>    player_name_4 DGC_1 DGC_2 DGC_3 DGC_4 MR_1 MR_2 MR_3 MR_4 CB_1 CB_2
    #> 1:          <NA>     7     5     0    NA    3    8   31   NA    2    9
    #> 2:     Tusi Pisi     3     0     8     9   25    0   85    0   13    0
    #>    CB_3 CB_4
    #> 1:    1   NA
    #> 2:    0    0
    

    Created on 2019-01-27 by the reprex package (v0.2.1)