I've built a spell check function for a sample of 1000 rows to ensure its efficiency, using the 'hunspell' package and the Australian English dictionary. The spell-checker ignores abbreviations. My actual data has close to 2 million lines, I therefore need to convert the 'for' loops into the 'apply' family functions.
I'm almost there I, but the last part isn't working. Below are the original for loop functions:
for(i in 1:nrow(data_words))
{
print(i)
string1 <- data_words$title[i]
string2 <- ""
for(j in 1:sapply(strsplit(string1, " "), length))
{
w <- word(string1, j)
# if word is not an abbreviation
if (!isAbbreviation(w))
{
# correct word
w <- correctText(w)
}
string2 <- paste0(string2, w, sep = " ")
# add word in new column 'spell_check'
data_words$spell_check[i] <- string2
}
}
isAbbreviation <- function(x)
{
abb = FALSE
# all capitalised letters
if(str_detect(x, "^[:upper:]+$"))
{
abb = TRUE
}
# dealing with abbs that end in an 's'
b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
list_empty = rlang::is_empty(unlist(b))
if(!list_empty)
{
abb = TRUE
}
return(abb)
}
correctText = function(x)
{
sapply(1:length(x), function(y)
{
# get misspelled words
bad_words = hunspell(x[y], dict = "en_AU")[[1]]
# if list of misspelled words is not empty
if(length(bad_words))
{
for (i in 1:length(bad_words))
{
list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_words[i],
dict = "en_AU")))
# if suggestion list is not empty
if(!list_empty)
{
# correct word
good = unlist(lapply(hunspell_suggest(bad_words[i], dict = "en_AU"), `[[`, 1))
}
else
{
# else leave word is it is
good = bad_words[i]
}
# replace mispelled words with corrected ones
x[y] <<- gsub(bad_words[i], good, x[y])
}
}
})
x
}
Reproducible sample of phrases to be corrected:
library(dplyr)
library(stringr)
library(hunspell)
library(textclean)
sample <-
c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", "Zika virus preegnancy update")
data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)
I had a go at it, please refer to the below functions:
# the abbreviation function remains the same
# function to correct a misspelled word
correctTheWord <- function(bad_word)
{
# print(bad_word)
if (!isAbbreviation(bad_word))
{
list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_word,
dict = "en_AU")))
if (!list_empty)
{
good = unlist(
lapply(hunspell_suggest(bad_word, dict = "en_AU"),
`[[`,
1
))
}
else
{
good = bad_word
}
}
else
{
good = bad_word
}
}
# correct whole row function
correctText = function(x)
{
sapply(1:length(x), function(y)
{
bad = hunspell(x[y], dict = "en_AU")[[1]]
if (length(bad))
{
return(mgsub(x, bad, lapply(bad, correctTheWord)))
}
else
{
return(x)
}
})
}
# testing the first 2 titles
correctText("Paaediatrics AsseSssing Febrile Infant Child")
correctText("Manuual Handling Traain Trainer")
# this is not working
data_words$spell_check <-
apply(data_words[, 1], 2, correctText)
Also, can my functions can be simplified further?
This will identify and replace incorrectly spelt words with the correct spelling. Note that it will ignore abbreviations as desired, and it assumes all words are separated by a space.
# First, define isAbbreviation
isAbbreviation <- function(x)
{
abb = FALSE
# all capitalised letters
if(str_detect(x, "^[:upper:]+$"))
{
abb = TRUE
}
# dealing with abbs that end in an 's'
b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
list_empty = rlang::is_empty(unlist(b))
if(!list_empty)
{
abb = TRUE
}
return(abb)
}
sample <-
c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer",
"Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical",
"Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma",
"Zika virus preegnancy update", "Basic Labelinsfbsbinsajectables technical")
data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)
correct_spelling <- function(text) {
words <- text %>%
str_split(" ") %>%
.[[1]]
abbreviation <- words %>% sapply(isAbbreviation) %>%
unname
# Abbreviations return false here, which is inconsequential since we don't replace them
correct <- words %>%
sapply(function(x) {hunspell_check(x, dict = dictionary("en_AU")) } ) %>%
unname
# Correct the word if incorrect and not abbreviation
if(!any(!(!abbreviation) & (!correct))) {
misspelled_and_not_abbreviation <- words[(!abbreviation) & (!correct)]
suggestions <- misspelled_and_not_abbreviation %>%
hunspell_suggest(dict = dictionary("en_AU"))
suggested_words <- sapply(seq_along(suggestions), function(y, i)
{ ifelse(length(y[[1]]) == 0, misspelled_and_not_abbreviation[i], y[[i]][1]) },
y=suggestions)
words[as.logical((!abbreviation) * (!correct))] <- suggested_words
}
words %>% paste0(collapse = " ")
}
data_words$spell_check2 <- data_words$title %>% sapply(correct_spelling) %>% unname
which gives
data_words
# title spell_check2
# <chr> <chr>
# 1 Paaediatrics AsseSssing Febrile Infant Child Paediatrics Assessing Febrile Infant Child
# 2 Manuual Handling Traain Trainer Manual Handling Train Trainer
# 3 Catheterise CTHs Catheterise CTHs
# 4 Labelinsfbsbinsajectables Labelinsfbsbinsajectables
# 5 Mentouring Mentoring
# 6 techhnical technical
# 7 Basic Life Support BSL Basic Life Support BSL
# 8 BloodSafe cliniiical transfusion practice Blood Safe clinical transfusion practice
# 9 Astthma Asthma
# 10 Zika virus preegnancy update Erika virus pregnancy update
# 11 Basic Labelinsfbsbinsajectables technical Basic Labelinsfbsbinsajectables technical