Search code examples
rbioconductor

Unique coordinates using GenomicRanges


How can i find unique (non overlapping genomic coordinates) by comparing two datasets using GenomicRanges?

dataset1 =

chr start         end       CNA
1   170900001   171500001   loss
1   11840001    19420001    loss
1   60300001    62700001    gain
1   25520001    25820001    gain

dataset2 =

chr  start       end        CNA
1   170940001   171500001   gain
1   60300001    62700001    gain
1   25520001    25840001    gain
1   119860001   123040001   loss
1   171500001   171580001   gain
1   79240001    84420001    gain

expected output

     chr     start       end        CNA
     1     170940001 171500001  gain
     1    119860001  123040001    loss
     1    171500001  171580001    gain
     1    79240001   84420001     gain

Solution

  • Try this:

    require("GenomicRanges"        )
    
    #data
    x1 <- read.table(text="chr start         end       CNA
    1   170900001   171500001   loss
    1   11840001    19420001    loss
    1   60300001    62700001    gain
    1   25520001    25820001    gain",header=TRUE)
    
    x2 <- read.table(text="chr  start       end        CNA
    1   170940001   171500001   loss
    1   60300001    62700001    gain
    1   25520001    25840001    gain
    1   119860001   123040001   loss
    1   171500001   171580001   gain
    1   79240001    84420001    gain",header=TRUE)
    
    g1 <-  GRanges(seqnames=paste0("chr",x1$chr),
                   IRanges(start=x1$start,
                           end=x1$end)
                   )
    
    g2 <-  GRanges(seqnames=paste0("chr",x2$chr),
                   IRanges(start=x2$start,
                           end=x2$end)
                   )
    
    #result
    setdiff(g1,g2)
    
    #output
    # GRanges object with 2 ranges and 0 metadata columns:
    #       seqnames                 ranges strand
    #          <Rle>              <IRanges>  <Rle>
    #   [1]     chr1 [ 11840001,  19420001]      *
    #   [2]     chr1 [170900001, 170940000]      *
    #   -------
    #   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    

    EDIT: Subset based on CNA and setdiff, then merge - c(). Although, doesn't match with expected output...

    #result
    g1_loss <- setdiff(g1[g1@elementMetadata$CNA=="loss"],
                       g2[g2@elementMetadata$CNA=="loss"])
    g1_loss@elementMetadata$CNA <- "loss"
    
    g2_loss <- setdiff(g2[g2@elementMetadata$CNA=="loss"],
                       g1[g1@elementMetadata$CNA=="loss"])
    g2_loss@elementMetadata$CNA <- "loss"
    
    g1_gain <- setdiff(g1[g1@elementMetadata$CNA=="gain"],
                       g2[g2@elementMetadata$CNA=="gain"])
    g1_gain@elementMetadata$CNA <- "gain"
    
    g2_gain <- setdiff(g2[g2@elementMetadata$CNA=="gain"],
                       g1[g1@elementMetadata$CNA=="gain"])
    g2_gain@elementMetadata$CNA <- "gain"
    
    #merge
    c(g1_gain,g2_gain,g1_loss,g1_gain)
    
    #output
    # GRanges object with 5 ranges and 1 metadata column:
    #     seqnames                 ranges strand |         CNA
    #        <Rle>              <IRanges>  <Rle> | <character>
    # [1]     chr1 [ 25820002,  25840001]      * |        gain
    # [2]     chr1 [ 79240001,  84420001]      * |        gain
    # [3]     chr1 [170940001, 171580001]      * |        gain
    # [4]     chr1 [ 11840001,  19420001]      * |        loss
    # [5]     chr1 [170900001, 171500001]      * |        loss
    # -------
    #   seqinfo: 1 sequence from an unspecified genome; no seqlengths