Search code examples
rmergetidyrmelt

Merge two datasets, keep all row differences and add similar rows


I have two datasets Data and Data1. I want to merge these keeping all difference while adding the numeric values together in the new table for all common rows. Is there any easy tool for this?

head(Data)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        4          6            0             0        6          0             0
2   chr1   911428         .         C         T        1        2          3            0             0        3          0             0
3   chr1   953279         .         T         C      146      126        272            0             0      273          1             0
4   chr1   962184         .         T         C       14       15         29            0             0       29          0             0
5   chr1  1024129         .         T         G        1        0          1            0             0        1          0             0
6   chr1  1039514         .         C         T        1        1          2            0             0        2          0             0
head(Data1)
  contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
1   chr1   905373         .         T         C        2        3          5            0             0        5          0             0
2   chr1   933024         .         C         T        1        0          1            0             0        1          0             0
3   chr1   953279         .         T         C      122      124        246            0             0      248          2             0
4   chr1   962184         .         T         C       17       21         38            0             0       38          0             0
5   chr1  1022518         .         G         T        0        1          1            0             0        1          0             0
6   chr1  1024129         .         T         G        1        2          3            0             0        3          0             0

Wanted Output example

contig  position    variantID   refAllele   altAllele   refCount    altCount    totalCount lowMAPQDepth lowBaseQDepth   rawDepth    otherBases  improperPairs
1   chr1    905373  .   T   C   4   7   11  0   0   11  0   0
2   chr1    911428  .   C   T   1   2   3   0   0   3   0   0
2   chr1    933024  .   C   T   1   0   1   0   0   1   0   0
4   chr1    953279  .   T   C   268 150 518 0   0   521 3   0

As we can see in column position site 905373 is common is added together from column refCount on. while site 911428 and 933024 are both unique to their datasets but inserted into the new dataset. Is their a none painful way of creating the output table?

Data <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 911428L, 953279L, 962184L, 1024129L, 
1039514L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "T", "C"), altAllele = c("C", "T", "C", "C", "G", 
"T"), refCount = c(2L, 1L, 146L, 14L, 1L, 1L), altCount = c(4L, 
2L, 126L, 15L, 0L, 1L), totalCount = c(6L, 3L, 272L, 29L, 1L, 
2L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(6L, 3L, 273L, 29L, 1L, 2L), 
    otherBases = c(0L, 0L, 1L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")

Data1 <- structure(list(contig = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1"), position = c(905373L, 933024L, 953279L, 962184L, 1022518L, 
1024129L), variantID = c(".", ".", ".", ".", ".", "."), refAllele = c("T", 
"C", "T", "T", "G", "T"), altAllele = c("C", "T", "C", "C", "T", 
"G"), refCount = c(2L, 1L, 122L, 17L, 0L, 1L), altCount = c(3L, 
0L, 124L, 21L, 1L, 2L), totalCount = c(5L, 1L, 246L, 38L, 1L, 
3L), lowMAPQDepth = c(0L, 0L, 0L, 0L, 0L, 0L), lowBaseQDepth = c(0L, 
0L, 0L, 0L, 0L, 0L), rawDepth = c(5L, 1L, 248L, 38L, 1L, 3L), 
    otherBases = c(0L, 0L, 2L, 0L, 0L, 0L), improperPairs = c(0L, 
    0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")

Solution

  • Here a posibility:

    Data
    #>   contig position variantID refAllele altAllele refCount altCount totalCount
    #> 1   chr1   905373         .         T         C        2        4          6
    #> 2   chr1   911428         .         C         T        1        2          3
    #> 3   chr1   953279         .         T         C      146      126        272
    #> 4   chr1   962184         .         T         C       14       15         29
    #> 5   chr1  1024129         .         T         G        1        0          1
    #> 6   chr1  1039514         .         C         T        1        1          2
    #>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
    #> 1            0             0        6          0             0
    #> 2            0             0        3          0             0
    #> 3            0             0      273          1             0
    #> 4            0             0       29          0             0
    #> 5            0             0        1          0             0
    #> 6            0             0        2          0             0
    
    Data1
    #>   contig position variantID refAllele altAllele refCount altCount totalCount
    #> 1   chr1   905373         .         T         C        2        3          5
    #> 2   chr1   933024         .         C         T        1        0          1
    #> 3   chr1   953279         .         T         C      122      124        246
    #> 4   chr1   962184         .         T         C       17       21         38
    #> 5   chr1  1022518         .         G         T        0        1          1
    #> 6   chr1  1024129         .         T         G        1        2          3
    #>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
    #> 1            0             0        5          0             0
    #> 2            0             0        1          0             0
    #> 3            0             0      248          2             0
    #> 4            0             0       38          0             0
    #> 5            0             0        1          0             0
    #> 6            0             0        3          0             0
    
    aggregate(. ~ contig +  position + variantID + refAllele + altAllele, rbind(Data, Data1), sum)
    #>   contig position variantID refAllele altAllele refCount altCount totalCount
    #> 1   chr1   905373         .         T         C        4        7         11
    #> 2   chr1   953279         .         T         C      268      250        518
    #> 3   chr1   962184         .         T         C       31       36         67
    #> 4   chr1  1024129         .         T         G        2        2          4
    #> 5   chr1   911428         .         C         T        1        2          3
    #> 6   chr1   933024         .         C         T        1        0          1
    #> 7   chr1  1039514         .         C         T        1        1          2
    #> 8   chr1  1022518         .         G         T        0        1          1
    #>   lowMAPQDepth lowBaseQDepth rawDepth otherBases improperPairs
    #> 1            0             0       11          0             0
    #> 2            0             0      521          3             0
    #> 3            0             0       67          0             0
    #> 4            0             0        4          0             0
    #> 5            0             0        3          0             0
    #> 6            0             0        1          0             0
    #> 7            0             0        2          0             0
    #> 8            0             0        1          0             0