Search code examples
rapplylapplysapply

Function to replace incorrectly spelled words with correctly spelled words in R?


I've built a spell check function for a sample of 1000 rows to ensure its efficiency, using the 'hunspell' package and the Australian English dictionary. The spell-checker ignores abbreviations. My actual data has close to 2 million lines, I therefore need to convert the 'for' loops into the 'apply' family functions.

I'm almost there I, but the last part isn't working. Below are the original for loop functions:

for(i in 1:nrow(data_words))
{
  print(i)
  
  string1 <- data_words$title[i]
  string2 <- ""
  
  for(j in 1:sapply(strsplit(string1, " "), length))
  {
    w <- word(string1, j)
    
    # if word is not an abbreviation
    if (!isAbbreviation(w))
    {
      # correct word
      w <- correctText(w)
    }
    
    string2 <- paste0(string2, w, sep = " ")
    
    # add word in new column 'spell_check'
    data_words$spell_check[i] <- string2
    
  }
}

isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}

correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    # get misspelled words
    bad_words = hunspell(x[y], dict = "en_AU")[[1]]
    
    # if list of misspelled words is not empty
    if(length(bad_words))
    {
      for (i in 1:length(bad_words))
      {
        list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_words[i], 
                                                             dict = "en_AU")))
        # if suggestion list is not empty
        if(!list_empty)
        {
          # correct word
          good = unlist(lapply(hunspell_suggest(bad_words[i], dict = "en_AU"), `[[`, 1))
        }
        else
        {
          # else leave word is it is
          good = bad_words[i]
        }
        # replace mispelled words with corrected ones
        x[y] <<- gsub(bad_words[i], good, x[y])
      }
    }
  })
  x
}

Reproducible sample of phrases to be corrected:

library(dplyr)
library(stringr)
library(hunspell)
library(textclean)

sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", "Zika virus preegnancy update")

data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)

I had a go at it, please refer to the below functions:

# the abbreviation function remains the same

# function to correct a misspelled word
correctTheWord <- function(bad_word)
{
  # print(bad_word)
  
  if (!isAbbreviation(bad_word))
  {
    list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_word,
                                                         dict = "en_AU")))
    
    if (!list_empty)
    {
      good = unlist(
        lapply(hunspell_suggest(bad_word, dict = "en_AU"),
               `[[`,
               1
        ))
    }
    else
    {
      good = bad_word
    }
  }
  
  else
  {
    good = bad_word
  }
}

# correct whole row function
correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    bad = hunspell(x[y], dict = "en_AU")[[1]]
    
    if (length(bad))
    {
      return(mgsub(x, bad, lapply(bad, correctTheWord)))
    }
    else
    {
      return(x)
    }
  })
}


# testing the first 2 titles
correctText("Paaediatrics AsseSssing Febrile Infant Child")
correctText("Manuual Handling Traain Trainer")


# this is not working 
data_words$spell_check <- 
  apply(data_words[, 1], 2,  correctText)

Also, can my functions can be simplified further?


Solution

  • This will identify and replace incorrectly spelt words with the correct spelling. Note that it will ignore abbreviations as desired, and it assumes all words are separated by a space.

    
    # First, define isAbbreviation
    
    isAbbreviation <- function(x)
    {
      abb = FALSE
      
      # all capitalised letters
      if(str_detect(x, "^[:upper:]+$"))
      {
        abb = TRUE
      }
      
      # dealing with abbs that end in an 's'
      b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
      list_empty = rlang::is_empty(unlist(b))
      
      if(!list_empty)
      {
        abb = TRUE
      }
      return(abb)
    }
    
    
    sample <- 
      c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", 
        "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", 
        "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", 
        "Zika virus preegnancy update", "Basic Labelinsfbsbinsajectables technical")
    
    data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
    names(data_words) <- "title"
    data_words$title <- sample
    data_words <- as_tibble(data_words)
    
    
    correct_spelling <- function(text) {
      
      words <- text %>% 
      str_split(" ") %>% 
      .[[1]]
    
      abbreviation <- words %>% sapply(isAbbreviation) %>% 
        unname
      
      # Abbreviations return false here, which is inconsequential since we don't replace them 
      correct <- words %>% 
        sapply(function(x) {hunspell_check(x, dict = dictionary("en_AU")) } ) %>% 
        unname
      
      # Correct the word if incorrect and not abbreviation
      if(!any(!(!abbreviation) & (!correct))) {
      
        misspelled_and_not_abbreviation <- words[(!abbreviation) & (!correct)] 
      
      
        suggestions <- misspelled_and_not_abbreviation %>% 
          hunspell_suggest(dict = dictionary("en_AU")) 
        
        suggested_words <- sapply(seq_along(suggestions), function(y, i) 
          { ifelse(length(y[[1]]) == 0, misspelled_and_not_abbreviation[i], y[[i]][1]) }, 
          y=suggestions)
    
        words[as.logical((!abbreviation) * (!correct))] <- suggested_words 
      
      }
      
      words %>% paste0(collapse = " ")
    
      
    }
    
    data_words$spell_check2 <- data_words$title %>% sapply(correct_spelling) %>% unname
    

    which gives

    data_words
    
    #    title                                        spell_check2                              
    #    <chr>                                        <chr>                                     
    #  1 Paaediatrics AsseSssing Febrile Infant Child Paediatrics Assessing Febrile Infant Child
    #  2 Manuual Handling Traain Trainer              Manual Handling Train Trainer             
    #  3 Catheterise CTHs                             Catheterise CTHs                          
    #  4 Labelinsfbsbinsajectables                    Labelinsfbsbinsajectables                 
    #  5 Mentouring                                   Mentoring                                 
    #  6 techhnical                                   technical                                 
    #  7 Basic Life Support BSL                       Basic Life Support BSL                    
    #  8 BloodSafe cliniiical transfusion practice    Blood Safe clinical transfusion practice  
    #  9 Astthma                                      Asthma                                    
    # 10 Zika virus preegnancy update                 Erika virus pregnancy update              
    # 11 Basic Labelinsfbsbinsajectables technical    Basic Labelinsfbsbinsajectables technical