Search code examples
rdata.tablelapplymeltdcast

Melting two datasets together resulting in NAs in R


I am trying to put together two datasets but seem to be encountering a problem.

Here is my first dataset:

head(merged_assays)
# A tibble: 6 × 13
  squirrel_id grid  sex   ageclass   age cohort  year trialnumber trialdate  assay.local.density assay_avg_fam   OFT1    OFT2
        <dbl> <chr> <chr> <chr>    <dbl>  <int> <dbl> <chr>       <date>                   <dbl>         <dbl>  <dbl>   <dbl>
        7110 AG    F     A            2   2003  2005 1           2005-05-16               0                0  -0.595 -0.0859
        6748 AG    F     Y            1   2004  2005 1           2005-05-17               0.942            0  -0.228 -0.0403
        7545 AG    F     A            3   2002  2005 1           2005-05-17               0.942          146  -0.366 -0.272 
        7841 AG    F     A            2   2003  2005 1           2005-05-17               1.13           228. -1.38   0.252 
        6348 SU    F     A            5   2000  2005 1           2005-05-18               0                0   1.61   0.0803
        7553 SU    F     A            3   2002  2005 1           2005-05-18               0.753            0   0.206  0.442 

The second dataset:

 head(merged_axys)
# A tibble: 6 × 13
  squirrel_id grid  sex   axy_ageclass axy_age byear axy_yr axy_id                 axy_date   axy.local.density axy_avg_fam    PC1     PC2
        <dbl> <chr> <chr> <chr>          <dbl> <int>  <dbl> <chr>                  <date>                 <dbl>       <dbl>  <dbl>   <dbl>
       19553 JO    F     A                  3  2013   2016 19553-2016-03-14-dawn  2016-03-14              2.83        421. -1.51  0.148  
       19553 JO    F     A                  3  2013   2016 19553-2016-03-14-day   2016-03-14              2.83        421.  1.16  0.00814
       19553 JO    F     A                  3  2013   2016 19553-2016-03-14-dusk  2016-03-14              2.83        421. -1.55  0.0961 
       19553 JO    F     A                  3  2013   2016 19553-2016-03-14-night 2016-03-14              2.83        421. -1.41  0.282  
       19553 JO    F     A                  3  2013   2016 19553-2016-03-15-dawn  2016-03-15              2.83        421. -0.541 1.41   
       19553 JO    F     A                  3  2013   2016 19553-2016-03-15-day   2016-03-15              2.83        421.  1.07  0.107  

I then do the following:

L <- lapply(list(merged_assays, merged_axys), reshape2::melt, id.vars = 1:11)

DT <- data.table::rbindlist(L, use.names = FALSE, fill = FALSE)

merged<-data.table::dcast(DT, ... ~ variable, value.var = "value")

However, I get this output:

merged       
       squirrel_id grid  sex ageclass age cohort year         trialnumber  trialdate assay.local.density assay_avg_fam       OFT1       OFT2
    1:          NA <NA> <NA>     <NA>  NA     NA 2022  NA-2022-06-14-dawn 2022-06-14                  NA            NA         NA         NA
    2:          NA <NA> <NA>     <NA>  NA     NA 2022   NA-2022-06-14-day 2022-06-14                  NA            NA         NA         NA
    3:          NA <NA> <NA>     <NA>  NA     NA 2022  NA-2022-06-14-dusk 2022-06-14                  NA            NA         NA         NA
    4:          NA <NA> <NA>     <NA>  NA     NA 2022 NA-2022-06-14-night 2022-06-14                  NA            NA         NA         NA
    5:          NA <NA> <NA>     <NA>  NA     NA 2022  NA-2022-06-15-dawn 2022-06-15                  NA            NA         NA         NA
   ---                                                                                                                                      
38848:       25881   KL    F        Y   1   2022 2023                   1 2023-05-18                   0             0 -2.3801853  2.6514424
38849:       25970   KL    F        Y   1   2022 2023                   1 2023-05-20                   0             0  2.5803171  1.7898205
38850:       26010   JO    F        Y   1   2022 2023                   1 2023-05-23                   0             0  0.4207975  0.2827015
38851:       26017   KL    F        Y   1   2022 2023                   1 2023-05-18                   0             0 -0.7111936 -0.1329425
38852:      100071   KL    F        J   0   2023 2023                   1 2023-05-18                   0             0 -2.2823039  2.6170468
             PC1         PC2
    1: -1.503872  0.15660897
    2:  1.244080 -0.09492022
    3: -1.455461  0.21956184
    4: -1.527755  0.12555208
    5: -1.478833  0.18916969
   ---                      
38848:        NA          NA
38849:        NA          NA
38850:        NA          NA
38851:        NA          NA
38852:        NA          NA

The desired output:

squirrel_id grid  sex   ageclass   age cohort  year trialnumber      trialdate  assay.local.density assay_avg_fam   OFT1    OFT2    PC1   PC2
7110 AG    F     A            2   2003  2005 1                       2005-05-16               0                0  -0.595 -0.0859   NA   NA
6748 AG    F     Y            1   2004  2005 1                       2005-05-17               0.942            0  -0.228 -0.0403   NA   NA
7545 AG    F     A            3   2002  2005 1                       2005-05-17               0.942          146  -0.366 -0.272   NA   NA 
7841 AG    F     A            2   2003  2005 1                       2005-05-17               1.13           228. -1.38   0.252   NA   NA 
6348 SU    F     A            5   2000  2005 1                       2005-05-18               0                0   1.61   0.0803   NA   NA
7553 SU    F     A            3   2002  2005 1                       2005-05-18               0.753            0   0.206  0.442   NA   NA 
19553 JO    F     A            3  2013   2016 19553-2016-03-14-dawn  2016-03-14              2.83            421.   NA     NA -1.51  0.148  
19553 JO    F     A            3  2013   2016 19553-2016-03-14-day   2016-03-14              2.83            421.   NA     NA  1.16  0.00814
19553 JO    F     A            3  2013   2016 19553-2016-03-14-dusk  2016-03-14              2.83            421.   NA     NA -1.55  0.0961 
19553 JO    F     A            3  2013   2016 19553-2016-03-14-night 2016-03-14              2.83            421.   NA     NA -1.41  0.282  
19553 JO    F     A            3  2013   2016 19553-2016-03-15-dawn  2016-03-15              2.83            421.   NA     NA -0.541 1.41   
19553 JO    F     A            3  2013   2016 19553-2016-03-15-day   2016-03-15              2.83            421.   NA     NA  1.07  0.107  

Does anyone know what I am doing wrong in when I am putting these two datasets together? I'd appreciate any suggestions - including solutions that aren't in the same format as the code I've been working on.


Solution

  • Up front, I'm assuming that the difference in column names is not a concern for you. While some seem seem harmless (ageclass vs axy_ageclass), I'm not as confident about trialnumber-vs-axy_id.

    Assuming that you know these are good, we need to change the names in order to do what you want.

    First attempt goes through the melt-rbind-dcast method you suggested. To do this, we need to not use rbindlist, since it will not ignore the column names as I think you want.

    L <- lapply(list(merged_assays, merged_axys), reshape2::melt, id.vars = 1:11)
    names(L[[2]]) <- names(L[[1]])
    as.data.table(do.call(rbind, L)) |>
      data.table::dcast(... ~ variable, value.var="value")
    # Key: <squirrel_id, grid, sex, ageclass, age, cohort, year, trialnumber, trialdate, assay.local.density, assay_avg_fam>
    # #     squirrel_id   grid    sex ageclass   age cohort  year            trialnumber  trialdate assay.local.density assay_avg_fam   OFT1    OFT2    PC1     PC2
    # #           <int> <char> <char>   <char> <int>  <int> <int>                 <char>     <char>               <num>         <num>  <num>   <num>  <num>   <num>
    # #  1:        6348     SU      F        A     5   2000  2005                      1 2005-05-18               0.000             0  1.610  0.0803     NA      NA
    # #  2:        6748     AG      F        Y     1   2004  2005                      1 2005-05-17               0.942             0 -0.228 -0.0403     NA      NA
    # #  3:        7110     AG      F        A     2   2003  2005                      1 2005-05-16               0.000             0 -0.595 -0.0859     NA      NA
    # #  4:        7545     AG      F        A     3   2002  2005                      1 2005-05-17               0.942           146 -0.366 -0.2720     NA      NA
    # #  5:        7553     SU      F        A     3   2002  2005                      1 2005-05-18               0.753             0  0.206  0.4420     NA      NA
    # #  6:        7841     AG      F        A     2   2003  2005                      1 2005-05-17               1.130           228 -1.380  0.2520     NA      NA
    # #  7:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dawn 2016-03-14               2.830           421     NA      NA -1.510 0.14800
    # #  8:       19553     JO      F        A     3   2013  2016   19553-2016-03-14-day 2016-03-14               2.830           421     NA      NA  1.160 0.00814
    # #  9:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dusk 2016-03-14               2.830           421     NA      NA -1.550 0.09610
    # # 10:       19553     JO      F        A     3   2013  2016 19553-2016-03-14-night 2016-03-14               2.830           421     NA      NA -1.410 0.28200
    # # 11:       19553     JO      F        A     3   2013  2016  19553-2016-03-15-dawn 2016-03-15               2.830           421     NA      NA -0.541 1.41000
    # # 12:       19553     JO      F        A     3   2013  2016   19553-2016-03-15-day 2016-03-15               2.830           421     NA      NA  1.070 0.10700
    

    Second attempt changes the names before melting,

    names(merged_axys)[1:11] <- names(merged_assays)[1:11]
    L <- lapply(list(merged_assays, merged_axys), reshape2::melt, id.vars = 1:11)
    data.table::rbindlist(L, use.names = FALSE, fill = FALSE) |>
      dcast(... ~ variable, value.name = "value")
    #  Key: <squirrel_id, grid, sex, ageclass, age, cohort, year, trialnumber, trialdate, assay.local.density, assay_avg_fam>
    #     squirrel_id   grid    sex ageclass   age cohort  year            trialnumber  trialdate assay.local.density assay_avg_fam   OFT1    OFT2    PC1     PC2
    #           <int> <char> <char>   <char> <int>  <int> <int>                 <char>     <char>               <num>         <num>  <num>   <num>  <num>   <num>
    #  1:        6348     SU      F        A     5   2000  2005                      1 2005-05-18               0.000             0  1.610  0.0803     NA      NA
    #  2:        6748     AG      F        Y     1   2004  2005                      1 2005-05-17               0.942             0 -0.228 -0.0403     NA      NA
    #  3:        7110     AG      F        A     2   2003  2005                      1 2005-05-16               0.000             0 -0.595 -0.0859     NA      NA
    #  4:        7545     AG      F        A     3   2002  2005                      1 2005-05-17               0.942           146 -0.366 -0.2720     NA      NA
    #  5:        7553     SU      F        A     3   2002  2005                      1 2005-05-18               0.753             0  0.206  0.4420     NA      NA
    #  6:        7841     AG      F        A     2   2003  2005                      1 2005-05-17               1.130           228 -1.380  0.2520     NA      NA
    #  7:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dawn 2016-03-14               2.830           421     NA      NA -1.510 0.14800
    #  8:       19553     JO      F        A     3   2013  2016   19553-2016-03-14-day 2016-03-14               2.830           421     NA      NA  1.160 0.00814
    #  9:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dusk 2016-03-14               2.830           421     NA      NA -1.550 0.09610
    # 10:       19553     JO      F        A     3   2013  2016 19553-2016-03-14-night 2016-03-14               2.830           421     NA      NA -1.410 0.28200
    # 11:       19553     JO      F        A     3   2013  2016  19553-2016-03-15-dawn 2016-03-15               2.830           421     NA      NA -0.541 1.41000
    # 12:       19553     JO      F        A     3   2013  2016   19553-2016-03-15-day 2016-03-15               2.830           421     NA      NA  1.070 0.10700
    

    Third attempt skips the melting and just rbinds them, since I don't think that's really necessary (I could be wrong). Still, this renames the second frame's first 11 columns again.

    # from original frames
    names(merged_axys)[1:11] <- names(merged_assays)[1:11]
    rbindlist(list(merged_assays, merged_axys), fill = TRUE, use.names = TRUE)
    #     squirrel_id   grid    sex ageclass   age cohort  year            trialnumber  trialdate assay.local.density assay_avg_fam   OFT1    OFT2    PC1     PC2
    #           <int> <char> <char>   <char> <int>  <int> <int>                 <char>     <char>               <num>         <num>  <num>   <num>  <num>   <num>
    #  1:        7110     AG      F        A     2   2003  2005                      1 2005-05-16               0.000             0 -0.595 -0.0859     NA      NA
    #  2:        6748     AG      F        Y     1   2004  2005                      1 2005-05-17               0.942             0 -0.228 -0.0403     NA      NA
    #  3:        7545     AG      F        A     3   2002  2005                      1 2005-05-17               0.942           146 -0.366 -0.2720     NA      NA
    #  4:        7841     AG      F        A     2   2003  2005                      1 2005-05-17               1.130           228 -1.380  0.2520     NA      NA
    #  5:        6348     SU      F        A     5   2000  2005                      1 2005-05-18               0.000             0  1.610  0.0803     NA      NA
    #  6:        7553     SU      F        A     3   2002  2005                      1 2005-05-18               0.753             0  0.206  0.4420     NA      NA
    #  7:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dawn 2016-03-14               2.830           421     NA      NA -1.510 0.14800
    #  8:       19553     JO      F        A     3   2013  2016   19553-2016-03-14-day 2016-03-14               2.830           421     NA      NA  1.160 0.00814
    #  9:       19553     JO      F        A     3   2013  2016  19553-2016-03-14-dusk 2016-03-14               2.830           421     NA      NA -1.550 0.09610
    # 10:       19553     JO      F        A     3   2013  2016 19553-2016-03-14-night 2016-03-14               2.830           421     NA      NA -1.410 0.28200
    # 11:       19553     JO      F        A     3   2013  2016  19553-2016-03-15-dawn 2016-03-15               2.830           421     NA      NA -0.541 1.41000
    # 12:       19553     JO      F        A     3   2013  2016   19553-2016-03-15-day 2016-03-15               2.830           421     NA      NA  1.070 0.10700
    

    Original data:

    merged_assays <- structure(list(squirrel_id = c(7110L, 6748L, 7545L, 7841L, 6348L, 7553L), grid = c("AG", "AG", "AG", "AG", "SU", "SU"), sex = c("F", "F", "F", "F", "F", "F"), ageclass = c("A", "Y", "A", "A", "A", "A"), age = c(2L, 1L, 3L, 2L, 5L, 3L), cohort = c(2003L, 2004L, 2002L, 2003L, 2000L, 2002L), year = c(2005L, 2005L, 2005L, 2005L, 2005L, 2005L), trialnumber = c(1L, 1L, 1L, 1L, 1L, 1L), trialdate = c("2005-05-16", "2005-05-17", "2005-05-17", "2005-05-17", "2005-05-18", "2005-05-18"), assay.local.density = c(0,  0.942, 0.942, 1.13, 0, 0.753), assay_avg_fam = c(0, 0, 146, 228, 0, 0), OFT1 = c(-0.595, -0.228, -0.366, -1.38, 1.61, 0.206), OFT2 = c(-0.0859, -0.0403, -0.272, 0.252, 0.0803, 0.442)), row.names = c(NA, -6L), class = "data.frame")
    merged_axys <- structure(list(squirrel_id = c(19553L, 19553L, 19553L, 19553L, 19553L, 19553L), grid = c("JO", "JO", "JO", "JO", "JO", "JO"), sex = c("F", "F", "F", "F", "F", "F"), axy_ageclass = c("A", "A", "A", "A", "A", "A"), axy_age = c(3L, 3L, 3L, 3L, 3L, 3L), byear = c(2013L, 2013L, 2013L, 2013L, 2013L, 2013L), axy_yr = c(2016L, 2016L, 2016L, 2016L, 2016L, 2016L), axy_id = c("19553-2016-03-14-dawn", "19553-2016-03-14-day", "19553-2016-03-14-dusk", "19553-2016-03-14-night", "19553-2016-03-15-dawn", "19553-2016-03-15-day" ), axy_date = c("2016-03-14", "2016-03-14", "2016-03-14", "2016-03-14", "2016-03-15", "2016-03-15"), axy.local.density = c(2.83, 2.83, 2.83, 2.83, 2.83, 2.83), axy_avg_fam = c(421, 421, 421, 421, 421, 421), PC1 = c(-1.51, 1.16, -1.55, -1.41, -0.541, 1.07), PC2 = c(0.148, 0.00814, 0.0961, 0.282, 1.41, 0.107)), row.names = c(NA, -6L), class = "data.frame")