As a learning exercise, I am trying to re-create a regex expression in R the manual way.
For example, suppose I have this string:
var1 <- c("111 222 a1C 5b2", "B2G-6l3 atttr", "nothing here", "something P2b5p2 something")
I want to see if each element has the consecutive pattern: letter, number, letter, space/no space/separator, number, letter, number.
I tried to manually define conditions for this problem:
cond_1 <- c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z")
cond_2 <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
cond_3 <- c("", " ", "-")
Then, I tried to write a loop to check if each element in var1 satisfies these conditions:
original_value <- c()
pattern_found <- c()
value <- c()
for (i in var1) {
chars <- strsplit(i, "")[[1]]
found <- FALSE
for (j in 1:(length(chars) - 6)) {
# Check if the pattern is found
if (chars[j] %in% cond_1 && chars[j+1] %in% cond_2 && chars[j+2] %in% cond_1 &&
chars[j+3] %in% cond_3 && chars[j+4] %in% cond_2 && chars[j+5] %in% cond_1 &&
chars[j+6] %in% cond_2) {
found <- TRUE
break
}
}
original_value <- c(original_value, i)
pattern_found <- c(pattern_found, ifelse(found, "yes", "no"))
value <- c(value, ifelse(found, paste(chars[j:(j+6)], collapse = ""), NA))
}
df <- data.frame(original_value, pattern_found, value)
The code seems to have partly worked:
original_value pattern_found value
1 111 222 a1C 5b2 yes a1C 5b2
2 B2G-6l3 atttr yes B2G-6l3
3 nothing here no <NA>
4 something P2b5p2 something no <NA>
How can I fix this?
PS: Here is the classic regex approach:
pattern <- "[a-zA-Z]\\d[a-zA-Z][- ,_]*\\d[a-zA-Z]\\d"
original_value <- c()
pattern_found <- c()
value <- c()
for (i in var1) {
if (grepl(pattern, i)) {
original_value <- c(original_value, i)
pattern_found <- c(pattern_found, "yes")
value <- c(value, regmatches(i, regexpr(pattern, i)))
} else {
original_value <- c(original_value, i)
pattern_found <- c(pattern_found, "no")
value <- c(value, NA)
}
}
df <- data.frame(original_value, pattern_found, value)
You could manually build a regex pattern with character classes directly using your source vectors, e.g.
cond_1 <- c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z")
cond_2 <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
cond_3 <- c(" ", "-")
r1 <- paste0("[", paste(cond_1, collapse=""), "]")
r2 <- paste0("[", paste(cond_2, collapse=""), "]")
r3 <- paste0("[", paste(cond_3, collapse=""), "]")
regex <- paste0(r1, r2, r1, r3, "?", r2, r1, r2)
regex
[1] "[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][0123456789][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][ -]?[0123456789][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][0123456789]"