Search code examples
r bioinformatics bioconductor

merge two data.frame with condition in R


I would like to compare two data sets, df1 and df2, so that each unique value in df2$ID is added as a new column in df1. For each gene, that column should hold the df2$Xp value when the gene's coordinates in df1 overlap the coordinates in df2 (and 0 when there is no overlap):

 # Gene table: one row per gene with its chromosome and Start/End coordinates.
 df1 <- read.table(text = "
        Gene     chr  Start End
        Gm12724   4  1000   1105
        Zfhx2     4  1254   1369
        Usp17lc   7  5004   5412
        Lingo1    7  5698   5789
        Sart3     7  5987   6041
        Olfr978   4  1452   1564
    ", header = TRUE)


    # Interval table: each row is a sample ID, a chromosome interval
    # (Start/End) and the Xp value attached to that interval.
    df2 <- read.table(text = "
        ID      chr Start   End     Xp
        S8411     4  989    1258   0.312
        S8411     4  1300   1800   0.144
        S8411     7  5641   6874   0.136
        S8413     4  1307   1360  -1.999
    ", header = TRUE)

Expected output:

 # Expected result: df1 plus one column per unique df2$ID holding the Xp of
 # the overlapping interval, or 0 when there is no overlap.
 # `text =` is required; without it read.table() treats the string as a
 # file name. Zfhx2's Start is 1254, matching df1.
 df3 <- read.table(text = "
        Gene    chr   Start End   S8411  S8413
        Gm12724   4  1000   1105  0.312     0
        Zfhx2     4  1254   1369  0.144     -1.999
        Usp17lc   7  5004   5412    0       0
        Lingo1    7  5698   5789  0.136     0
        Sart3     7  5987   6041  0.136     0
        Olfr978   4  1452   1564   0.144    0
    ", header = TRUE)

Solution

  • Maybe this helps

    library(data.table)

    # Convert both inputs to data.tables and key them on (chr, Start, End).
    # Note: setDT()/setkey() modify df1 and df2 by reference (class changes,
    # rows are re-sorted by the key).
    setkey(setDT(df1), chr, Start, End)
    setkey(setDT(df2), chr, Start, End)

    # Overlap join of genes (df1) against intervals (df2) within the same chr.
    # In the result, Start/End are the df2 interval and i.Start/i.End are the
    # gene coordinates from df1; genes with no overlap get NA in the df2 columns.
    hits <- foverlaps(df1, df2, type = "any")

    # Keep unmatched genes (NA Start) and overlaps where the df2 interval
    # starts after the gene start or ends after the gene end.
    # NOTE(review): a gene whose only overlaps fail this condition is dropped
    # from the result entirely -- confirm this is intended for your data.
    res <- hits[(Start > i.Start | End > i.End) | is.na(Start)]

    # Report the gene's own coordinates, then drop the helper columns by name
    # rather than by position so the code does not depend on column order.
    res[, c("Start", "End") := list(i.Start, i.End)]
    res[, c("i.Start", "i.End") := NULL]

    # One column per ID holding Xp; combinations without a value are filled
    # with 0.
    wide <- dcast(res, ... ~ ID, value.var = "Xp", fill = 0)

    # Unmatched genes have ID == NA, which dcast turns into a column named
    # "NA". Drop it by name; setdiff() is a no-op when every gene matched
    # (a positional `-7` would fail in that case).
    wide[, setdiff(names(wide), "NA"), with = FALSE]
    #    chr Start  End    Gene S8411  S8413
    #1:   4  1000 1105 Gm12724 0.312  0.000
    #2:   4  1254 1369   Zfhx2 0.144 -1.999
    #3:   4  1452 1564 Olfr978 0.144  0.000
    #4:   7  5004 5412 Usp17lc 0.000  0.000
    #5:   7  5698 5789  Lingo1 0.136  0.000
    #6:   7  5987 6041   Sart3 0.136  0.000