Search code examples
rreshape2panel-data

Reshaping the HRS data from wide to long format and creating a time variable


I have the following dataset (containing around 25 more variables other than Weight = W and Height = H), all spanning 10 years.

Currently it has the following form and no time index.

df <- structure(list(data = structure(1:4, .Label = c("Ind_1", "Ind_2", 
"Ind_3", "Ind_4"), class = "factor"), r1weight = c(56, 76, 87, 64
), r2weight = c(57, 75, 88, 66), r3weight = c(56, 76, 87, 65), r4weight = c(56L, 
73L, 85L, 63L), r5weight = c(55L, 77L, 84L, 65L), r1height = c(151L, 163L, 
173L, 153L), r2height = c(154L, 164L, NA, 154L), r3height = c(NA, 165L, NA, 
152L), r4height = c(153L, 162L, 172L, 154L), r5height = c(152,161,171,154)), class = 
"data.frame", row.names = c(NA, 
 -4L)) 

  data  r1w r2w r3w r4w r5w r1h r2h r3h r4h r5h
1 Ind_1  56  57  56  56  55 151 154  NA 153 152
2 Ind_2  76  75  76  73  77 163 164 165 162 161
3 Ind_3  87  88  87  85  84 173  NA  NA 172 171
4 Ind_4  64  66  65  63  65 153 154 152 154 154`

I need to add time variable and reshape to long format, hopefully getting something like this.

dflong <- structure(list(time = structure(1:20, .Label = c("1", "2", 
     "3", "4", "5", "1","2","3","4","5", "1","2","3","4","5","1","2","3","4","5"), 
     class = "factor"), Ind = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4), W = c(56,57,56,56,55,76,75,76,73,77,87,88,87,85,84,64,66,65,63,65),
     H = c(151,154,NA,153,152,163,164,165,162,161,173,NA,NA,172,171,153,154,152,154,154)), class = "data.frame", row.names = c(NA, -20L))

which looks

   time Ind  W   H
1     1   1 56 151
2     2   1 57 154
3     3   1 56  NA
4     4   1 56 153
5     5   1 55 152
6     1   2 76 163
7     2   2 75 164
8     3   2 76 165
9     4   2 73 162
10    5   2 77 161
11    1   3 87 173
12    2   3 88  NA
13    3   3 87  NA
14    4   3 85 172
15    5   3 84 171
16    1   4 64 153
17    2   4 66 154
18    3   4 65 152
19    4   4 63 154
20    5   4 65 154`

I tried to use the reshape2-command and so far I've got:

library(reshape2)
dflong <- melt(df,id.vars = c("idhhpn",r1w-r10w, r1h-r10h (help writing compactly),
     time(needs help constructing) )`

I dont want to write "r1w, r2w, r3w", but more like r1weight-r10weight so I dont have to write all 10 time instances for all 25 variable.

So far I've got to this point

enter image description here

by using the follwing code

melt <- melt(setDT(HRSdata), measure = patterns("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"), 
     value.name = c("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"), 
     variable.name = "time")[, 
      idhhpn := as.integer(sub("\\D+", "", HRSdata))][order(idhhpn)][, .(time, idhhpn, srhlt, highbp, diabetes, cancer, lungev, heartp, strokev, psychev, arth, obese, agey, marpart, male, black, hispan, logass, logdebt, atotal, debt, lths, hsorged, somehs, scorAA, bachelor, graduate, works62, works65, momagey, dadagey, dadalive, momalive, vigact3, smokesn        )]          

Solution

  • An option using data.table that makes use of the measure/patterns would be to use melt. In the example, the column names have common patterns as 'weight', 'height' which we specify it in measure parameter to convert it to 'long' format, then extract the numeric part with sub to create 'Ind'

    library(data.table)
    melt(setDT(df), measure = patterns("weight", "height"), value.name = c("W", "H"), 
      variable.name = "time")[, 
        Ind := as.integer(sub("\\D+", "", data))][order(Ind)][, .(time, Ind, W, H)]
    #   time Ind  W   H
    # 1:    1   1 56 151
    # 2:    2   1 57 154
    # 3:    3   1 56  NA
    # 4:    4   1 56 153
    # 5:    5   1 55 152
    # 6:    1   2 76 163
    # 7:    2   2 75 164
    # 8:    3   2 76 165
    # 9:    4   2 73 162
    #10:    5   2 77 161
    #11:    1   3 87 173
    #12:    2   3 88  NA
    #13:    3   3 87  NA
    #14:    4   3 85 172
    #15:    5   3 84 171
    #16:    1   4 64 153
    #17:    2   4 66 154
    #18:    3   4 65 152
    #19:    4   4 63 154
    #20:    5   4 65 154