I have a data frame with 4 columns and I want to compare val1 and val2 and if the numerical values are the same then replace val2 with "NA".Notice row3 of the dataframe has 123 in val1 and 1236 in val2 so these are not the same and thus should be left as such.
actual data:
structure(list(record_id = c(1L, 2L, 2L, 3L, 4L, 5L), name = c("CDK19",
"DYNC1H1", "G6PD", "NPRL2", "CHD2", "LGI1"), val1 = c("c.386A>C",
"c.G5864T", "c.T1058C", "c.323_339+19del", "c.2876+3_2876+6delAAGT",
"c.757G>A"), val2 = c(NA, "c.5864G>T", NA, "c.323_339+19del",
"c.2876+3_2876+6delAAGT", "c.757G>A")), row.names = c(NA, -6L
), class = c("tbl_df", "tbl", "data.frame"))
dummy data:
# rec_id name val1 val2
# 1 abby g.B123C g.123B>C
# 1 Amy g.B183C g.123B>C
# 2 abby g.B123C g.1236B>C
# 3 Amy g.2876+3_2876+6delABC g.2876+3_2876+6delABC
# 4 Amber g.2876+3_2876+6delABC g.2876+3_2876+6delABC
# 5 Axe g.2876G>C NA
Desired output
# rec_id name val1 val2
# 1 abby g.B123C NA
# 1 Amy g.B183C g.123B>C
# 2 abby g.B123C g.1236B>C
# 3 Amy g.2876+3_2876+6delABC NA
# 4 Amber g.2876+3_2876+6delABC NA
# 5 Axe g.2876+g>c NA
My code:
mydata %>%
for (i in 1:nrow(mydata)) {
if (mydata$val1[i] == mydata$val2[i]) {
mydata$val2[i] <- NA
}
}
Update: more data:
df2 <- structure(list(record_id = c(125L, 125L), name = c("ADAMTS17", "ARHGEF10"), val1 = c("arr.GRCh37.15q26.1q26.3_92401782_102383473_x3dn", "arr.GRCh37.8p23.3p23.2_191530_3846288_x1dn"), val2 = c(NA_character_, NA_character_)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"))
df3 <- structure(list(record_id = 586:591, name = c("COQ4", "COQ4", "TTC7A", "COL11A1", "GJB2", "LNX1"), val1 = c("c.190C>T", "c.190C>T", "c.1433T>C", "c.4325T>C", "c.109G>A", "c.1662A>C"), val2 = c("c.197_198delGCinsAA", "c.197_198delGCinsAA", "c.1288-1G>T", NA, NA, "c.680C>T")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Here's one approach:
library(stringr)
mutate(df3, val2 = if_else(as.character(str_extract_all(val1, "\\d+")) == as.character(str_extract_all(val2, "\\d+")),
NA_character_, val2))
We extract the numbers using stringr, turn the lists into character vectors, then compare.
Data:
mydata <- structure(list(rec_id = c(1L, 1L, 2L, 3L, 4L, 5L), name = c("abby",
"Amy", "abby", "Amy", "Amber", "Axe"), val1 = c("g.B123C", "g.B183C",
"g.B123C", "g.2876+3_2876+6delABC", "g.2876+3_2876+6delABC",
"g.2876G>C"), val2 = c("g.123B>C", "g.123B>C", "g.1236B>C", "g.2876+3_2876+6delABC",
"g.2876+3_2876+6delABC", NA)), class = "data.frame", row.names = c(NA,
-6L))