Search code examples
rdataframedplyrconditional-statementsdata-cleaning

Compare two columns and replace the value in column 2 with NA if same as column 1


I have a data frame with 4 columns and I want to compare val1 and val2 and if the numerical values are the same then replace val2 with "NA".Notice row3 of the dataframe has 123 in val1 and 1236 in val2 so these are not the same and thus should be left as such.

actual data:
structure(list(record_id = c(1L, 2L, 2L, 3L, 4L, 5L), name = c("CDK19", 
"DYNC1H1", "G6PD", "NPRL2", "CHD2", "LGI1"), val1 = c("c.386A>C", 
"c.G5864T", "c.T1058C", "c.323_339+19del", "c.2876+3_2876+6delAAGT", 
"c.757G>A"), val2 = c(NA, "c.5864G>T", NA, "c.323_339+19del", 
"c.2876+3_2876+6delAAGT", "c.757G>A")), row.names = c(NA, -6L
), class = c("tbl_df", "tbl", "data.frame"))

dummy data:

# rec_id    name    val1    val2
# 1 abby    g.B123C g.123B>C
# 1 Amy     g.B183C g.123B>C
# 2 abby    g.B123C g.1236B>C
# 3 Amy     g.2876+3_2876+6delABC   g.2876+3_2876+6delABC
# 4 Amber   g.2876+3_2876+6delABC   g.2876+3_2876+6delABC
# 5 Axe     g.2876G>C   NA

Desired output

# rec_id    name    val1    val2
# 1 abby    g.B123C NA
# 1 Amy g.B183C g.123B>C
# 2 abby    g.B123C g.1236B>C
# 3 Amy g.2876+3_2876+6delABC   NA
# 4 Amber   g.2876+3_2876+6delABC   NA
# 5 Axe g.2876+g>c  NA

My code:

mydata %>%
  for (i in 1:nrow(mydata)) {
    if (mydata$val1[i] == mydata$val2[i]) {
      mydata$val2[i] <- NA
    }
  }

Update: more data:

df2 <- structure(list(record_id = c(125L, 125L), name = c("ADAMTS17", "ARHGEF10"), val1 = c("arr.GRCh37.15q26.1q26.3_92401782_102383473_x3dn", "arr.GRCh37.8p23.3p23.2_191530_3846288_x1dn"), val2 = c(NA_character_, NA_character_)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"))
df3 <- structure(list(record_id = 586:591, name = c("COQ4", "COQ4", "TTC7A", "COL11A1", "GJB2", "LNX1"), val1 = c("c.190C>T", "c.190C>T", "c.1433T>C", "c.4325T>C", "c.109G>A", "c.1662A>C"), val2 = c("c.197_198delGCinsAA", "c.197_198delGCinsAA", "c.1288-1G>T", NA, NA, "c.680C>T")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) 

Solution

  • Here's one approach:

    library(stringr)
    
    mutate(df3, val2 = if_else(as.character(str_extract_all(val1, "\\d+")) == as.character(str_extract_all(val2, "\\d+")), 
                               NA_character_, val2))
    

    We extract the numbers using , turn the lists into character vectors, then compare.

    Data:

    mydata <- structure(list(rec_id = c(1L, 1L, 2L, 3L, 4L, 5L), name = c("abby", 
    "Amy", "abby", "Amy", "Amber", "Axe"), val1 = c("g.B123C", "g.B183C", 
    "g.B123C", "g.2876+3_2876+6delABC", "g.2876+3_2876+6delABC", 
    "g.2876G>C"), val2 = c("g.123B>C", "g.123B>C", "g.1236B>C", "g.2876+3_2876+6delABC", 
    "g.2876+3_2876+6delABC", NA)), class = "data.frame", row.names = c(NA, 
    -6L))