Search code examples
rbioinformaticsgenome

How to coerce data.frame objects to Genomic Ranges objects in R?


I have several bed files in R as data.frame objects. Now I want to find overlap between at lean two bed files by element-wise.

To be clarify my question, I need to make a test row by row in 1st bed files (already in data frame objects), so taking only one row of the data frame as query, then give it to the interval tree where interval tree holding 2nd bed files(but need to coerce GRanges objects first).

my data looks like :

   idx  chrom     start    End    name        score  p-value
    1   chr1      32727    32817  MACS_peak_1  8.69 1.150748e-11
    2   chr1      52489    52552  MACS_peak_2  4.26 2.347418e-11
    3   chr1      65527    65590  MACS_peak_3  4.19 2.386635e-11
    4   chr1      65773    65904  MACS_peak_4  2.02 4.950495e-11
    5   chr1      66001    66117  MACS_peak_5  5.66 1.766784e-11
    6   chr1     115700   115769  MACS_peak_6 10.30 9.708738e-12
    7   chr1     136389   136452  MACS_peak_7  4.26 2.347418e-11
    8   chr1     235352   235415  MACS_peak_8  4.26 2.347418e-11
    9   chr1     235636   235700  MACS_peak_9  5.66 1.766784e-11
    10  chr1     432895   432958 MACS_peak_10  4.26 2.347418e-11


f1 <- function(bed.1, bed.2){
  query<- GRanges()
  subject = bed.2
  for(i in 1: length(bed.1)){
    query<-bed.1[i]
    o <- GenomicRanges::findOverlaps(query, subject, minoverlap = 2L, algorithm="intervaltree")
    hitfrom_<-query[queryHits(o)]
    hitTo_<-subject[subjectHits(o)]
    pint <-pintersect(hitfrom_, hitTo_)
    return(pint)
  }
}

This is my code how to iterate set of GRanges objects in bed.1 and call findOverlap() function to find where is the overlapped GRanges. This code doesn't give me the results what I want. somebody help me out ?? Thank you


Solution

  • I guess you don't need to operate row by row.

    text1 <- "idx  chrom     start    End    name        score  p-value
    1   chr1      32727    32817  MACS_peak_1  8.69 1.150748e-11
    2   chr1      52489    52552  MACS_peak_2  4.26 2.347418e-11
    3   chr1      65527    65590  MACS_peak_3  4.19 2.386635e-11
    4   chr1      65773    65904  MACS_peak_4  2.02 4.950495e-11
    5   chr1      66001    66117  MACS_peak_5  5.66 1.766784e-11
    6   chr1     115700   115769  MACS_peak_6 10.30 9.708738e-12
    7   chr1     136389   136452  MACS_peak_7  4.26 2.347418e-11
    8   chr1     235352   235415  MACS_peak_8  4.26 2.347418e-11
    9   chr1     235636   235700  MACS_peak_9  5.66 1.766784e-11
    10  chr1     432895   432958 MACS_peak_10  4.26 2.347418e-11"
    
    bed1 <- read.table(text=text1, head=T, as.is=T)
    
    library(GenomicRanges)
    
    bed1.gr <- GRanges(bed1$chrom, IRanges(bed1$start, bed1$End))
    bed2 <- data.frame(chr=c("chr1", "chr1"),
                       start=c(30000, 130000),
                       end=c(60000, 200000), stringsAsFactors = FALSE)
    bed2.gr <- GRanges(bed2$chr, IRanges(bed2$start, bed2$end))
    op <- findOverlaps(bed1.gr, bed2.gr)
    
    op.df <- data.frame(que=queryHits(op), sub=subjectHits(op),
                        stringsAsFactors = FALSE)
    
    bed1$que <- 1:nrow(bed1)
    bed2$sub <- 1:nrow(bed2)
    
    bed.n <- merge(bed1, op.df, by="que", all=T)
    bed.n <- merge(bed.n, bed2, by="sub", all=T)
    bed.n$que <- NULL
    bed.n$sub <- NULL
    bed.n
    #    idx chrom start.x    End         name score      p.value  chr start.y   end
    # 1    1  chr1   32727  32817  MACS_peak_1  8.69 1.150748e-11 chr1   30000 6e+04
    # 2    2  chr1   52489  52552  MACS_peak_2  4.26 2.347418e-11 chr1   30000 6e+04
    # 3    7  chr1  136389 136452  MACS_peak_7  4.26 2.347418e-11 chr1  130000 2e+05
    # 4    5  chr1   66001  66117  MACS_peak_5  5.66 1.766784e-11 <NA>      NA    NA
    # 5    6  chr1  115700 115769  MACS_peak_6 10.30 9.708738e-12 <NA>      NA    NA
    # 6    3  chr1   65527  65590  MACS_peak_3  4.19 2.386635e-11 <NA>      NA    NA
    # 7    4  chr1   65773  65904  MACS_peak_4  2.02 4.950495e-11 <NA>      NA    NA
    # 8    9  chr1  235636 235700  MACS_peak_9  5.66 1.766784e-11 <NA>      NA    NA
    # 9   10  chr1  432895 432958 MACS_peak_10  4.26 2.347418e-11 <NA>      NA    NA
    # 10   8  chr1  235352 235415  MACS_peak_8  4.26 2.347418e-11 <NA>      NA    NA