Search code examples
rdataframeloopsconditional-statementsrbind

how to extract rows which match based on two conditions from two dataframes


I have two dataframe one is snp region and other is Hic region.

dput(hic_region[1:15,1:4])
structure(list(rf1 = c(57584944, 57584944, 57584944, 57584944, 
57584944, 57584944, 57584944, 57584944, 50263463, 5e+07, 50263463, 
50263463, 35172197, 35172197, 57584944), rt1 = c(57624944, 57624944, 
57624944, 57624944, 57624944, 57624944, 57624944, 57624944, 50303463, 
51423463, 50303463, 50303463, 35212197, 35212197, 57624944), 
    rf2 = c(1899354, 1899354, 1899354, 1899354, 1899354, 1579354, 
    1899354, 1899354, 1899354, 1779354, 1899354, 1899354, 1899354, 
    1899354, 1499354), rt2 = c(1939354, 1939354, 1939354, 1939354, 
    1939354, 1619354, 1939354, 1939354, 1939354, 1819354, 1939354, 
    1939354, 1939354, 1939354, 1539354)), row.names = c(NA, -15L
), class = c("tbl_df", "tbl", "data.frame"))
> dput(snp_region[1:10,1:6])
structure(list(Gene = c("ENSG00000132819", "ENSG00000101162", 
"ENSG00000132819", "ENSG00000101162", "ENSG00000101162", "ENSG00000101162", 
"ENSG00000101162", "ENSG00000101162", "ENSG00000101162", "ENSG00000101162"
), `Gene-Chr` = c(20, 20, 20, 20, 20, 20, 20, 20, 20, 20), `Gene-Pos` = c(55975426, 
57598009, 55975426, 57598009, 57598009, 57598009, 57598009, 57598009, 
57598009, 57598009), RsId = c("rs6084653", "rs156356", "rs1741314", 
"rs6136489", "rs4814776", "rs13042885", "rs4814779", "rs6045615", 
"rs11696739", "rs4618126"), `SNP-Chr` = c(20, 20, 20, 20, 20, 
20, 20, 20, 20, 20), `SNP-Pos` = c(4157072, 1819280, 4155193, 
1923734, 1921523, 1924707, 1923271, 1931582, 1600925, 1930885
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))

I want to match the gene present in snp region to hic region from rf1 to rt1, if its present between these two region(rf1 and rt1), select that and then match snp position from snp region using rf2 and rt2 region. I wrote a code but it gives me repeated values.

snp <- data.frame()
for(i in 1:dim(snp_region)[1]){
  snp_pos <- snp_region$`SNP-Pos`[i]
  for(j in 1:dim(hic_region)[1]){
  if(snp_region$`Gene-Pos`[i] %in% seq(hic_region$rf1[j],hic_region$rt1[j],1)){
   hic_region1 <- hic_region[j,]
    if(snp_pos %in% seq(hic_region1$rf2,hic_region1$rt2,1)){
      preset <<- snp_region[i,]
    } 
  }
  }
  snp <- rbind(snp,preset)
}


dput(snp[1:5,1:6])
structure(list(Gene = c("ENSG00000131069", "ENSG00000131069", 
"ENSG00000131069", "ENSG00000101162", "ENSG00000101162"), `Gene-Chr` = c(20, 
20, 20, 20, 20), `Gene-Pos` = c(33487859, 33487859, 33487859, 
57598009, 57598009), RsId = c("rs4142441", "rs4142441", "rs4142441", 
"rs6136489", "rs4814776"), `SNP-Chr` = c(20, 20, 20, 20, 20), 
    `SNP-Pos` = c(42839620, 42839620, 42839620, 1923734, 1921523
    )), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
))

I wanted to get some feedback if I am doing something wrong, or if there is a better way to run two conditions to match rows for two dataframe. Thank you.


Solution

  • Here's another option using data.table. This process is called a non-equi join.

    In the on portion, you can create logic for Gene-Pos to be between rt1 and rf2 AND SNP-Pos to be between rf2 and rt2. Then select the columns you want included across both tables.

    library(data.table)
    
    setDT(hic_region)
    setDT(snp_region)
    
    matched_genes = unique(snp_region[
      hic_region, 
      on = .(`Gene-Pos` >= rf1, `Gene-Pos` <= rt1, `SNP-Pos` >= rf2, `SNP-Pos` <= rt2), 
      .(Gene, `Gene-Chr`, `Gene-Pos`=`x.Gene-Pos`, RsId, `SNP-Chr`, `SNP-Pos`=`x.SNP-Pos`, rf1, rt1, rf2, rt2), 
      nomatch = NULL
    ])
    
    matched_genes
    
    # Output
    
                  Gene Gene-Chr Gene-Pos       RsId SNP-Chr SNP-Pos      rf1      rt1     rf2     rt2
    1: ENSG00000101162       20 57598009  rs6136489      20 1923734 57584944 57624944 1899354 1939354
    2: ENSG00000101162       20 57598009  rs4814776      20 1921523 57584944 57624944 1899354 1939354
    3: ENSG00000101162       20 57598009 rs13042885      20 1924707 57584944 57624944 1899354 1939354
    4: ENSG00000101162       20 57598009  rs4814779      20 1923271 57584944 57624944 1899354 1939354
    5: ENSG00000101162       20 57598009  rs6045615      20 1931582 57584944 57624944 1899354 1939354
    6: ENSG00000101162       20 57598009  rs4618126      20 1930885 57584944 57624944 1899354 1939354
    7: ENSG00000101162       20 57598009 rs11696739      20 1600925 57584944 57624944 1579354 1619354