r text-mining tidyverse stringr quanteda

How to output in R all possible deviations of a word for a fixed distance value?

I have a word and want to output in R all possible deviatons (replacement, substitution, insertion) for a fixed distance value into a vector.

For instance, the word "Cat" and a fixed distance value of 1 results in a vector with the elements "cot", "at", ...

Solution

I'm going to assume that you want all actual words, not just permutations of the characters with an edit distance of 1 that would include non-words such as "zat".

We can do this using adist() to compute the edit distance between your target word and all eligible English words, taken from some word list. Here, I used the English syllable dictionary from the quanteda package (you did tag this question as quanteda after all) but this could have been any vector of English dictionary words from any other source as well.

To narrow things down, we first exclude all words that are different in length from the target word by your distance value.

distfn <- function(word, distance = 1) {
  # select eligible words for efficiency
  eligible_y_words <- names(quanteda::data_int_syllables)
  wordlengths <- nchar(eligible_y_words)
  eligible_y_words <- eligible_y_words[wordlengths >= (nchar(word) - distance) &
    wordlengths <= (nchar(word) + distance)]
  # compute Levenshtein distance
  distances <- utils::adist(word, eligible_y_words)[1, ]
  # return only those for the requested distance value
  eligible_y_words[distances == distance]
}

distfn("cat", 1)
##  [1] "at"   "bat"  "ca"   "cab"  "cac"  "cad"  "cai"  "cal"  "cam"  "can" 
## [11] "cant" "cao"  "cap"  "caq"  "car"  "cart" "cas"  "cast" "cate" "cato"
## [21] "cats" "catt" "cau"  "caw"  "cay"  "chat" "coat" "cot"  "ct"   "cut" 
## [31] "dat"  "eat"  "fat"  "gat"  "hat"  "kat"  "lat"  "mat"  "nat"  "oat" 
## [41] "pat"  "rat"  "sat"  "scat" "tat"  "vat"  "wat"

To demonstrate how this works on longer words, with alternative distance values.

distfn("coffee", 1)
## [1] "caffee"  "coffeen" "coffees" "coffel"  "coffer"  "coffey"  "cuffee" 
## [8] "toffee"

distfn("coffee", 2)
##  [1] "caffey"   "calfee"   "chafee"   "chaffee"  "cofer"    "coffee's"
##  [7] "coffelt"  "coffers"  "coffin"   "cofide"   "cohee"    "coiffe"  
## [13] "coiffed"  "colee"    "colfer"   "combee"   "comfed"   "confer"  
## [19] "conlee"   "coppee"   "cottee"   "coulee"   "coutee"   "cuffe"   
## [25] "cuffed"   "diffee"   "duffee"   "hoffer"   "jaffee"   "joffe"   
## [31] "mcaffee"  "moffet"   "noffke"   "offen"    "offer"    "roffe"   
## [37] "scoffed"  "soffel"   "soffer"   "yoffie"

(Yes, according to the CMU pronunciation dictionary, those are all actual words...)

EDIT: Make for all permutations of letters, not just actual words

This involves permutations from the alphabet that have the fixed edit distances from the input word. Here I've done it not particular efficiently by forming all permutations of letters within the eligible ranges, and then computing their edit distance from the target word, and then selecting them. So it's a variation of above, except instead of a dictionary, it uses permuted words.

distfn2 <- function(word, distance = 1) {
  result <- character()

  # start with deletions
  for (i in max((nchar(word) - distance), 0):(nchar(word) - 1)) {
    result <- c(
      result,
      combn(unlist(strsplit(word, "", fixed = TRUE)), i,
        paste,
        collapse = "", simplify = TRUE
      )
    )
  }

  # now for changes and insertions
  for (i in (nchar(word)):(nchar(word) + distance)) {
    # all possible edits
    edits <- apply(expand.grid(rep(list(letters), i)),
      1, paste0,
      collapse = ""
    )
    # remove original word
    edits <- edits[edits != word]
    # get all distances, add to result
    distances <- utils::adist(word, edits)[1, ]
    result <- c(result, edits[distances == distance])
  }

  result
}

For the OP example:

distfn2("cat", 1)
##   [1] "ca"   "ct"   "at"   "caa"  "cab"  "cac"  "cad"  "cae"  "caf"  "cag" 
##  [11] "cah"  "cai"  "caj"  "cak"  "cal"  "cam"  "can"  "cao"  "cap"  "caq" 
##  [21] "car"  "cas"  "aat"  "bat"  "dat"  "eat"  "fat"  "gat"  "hat"  "iat" 
##  [31] "jat"  "kat"  "lat"  "mat"  "nat"  "oat"  "pat"  "qat"  "rat"  "sat" 
##  [41] "tat"  "uat"  "vat"  "wat"  "xat"  "yat"  "zat"  "cbt"  "cct"  "cdt" 
##  [51] "cet"  "cft"  "cgt"  "cht"  "cit"  "cjt"  "ckt"  "clt"  "cmt"  "cnt" 
##  [61] "cot"  "cpt"  "cqt"  "crt"  "cst"  "ctt"  "cut"  "cvt"  "cwt"  "cxt" 
##  [71] "cyt"  "czt"  "cau"  "cav"  "caw"  "cax"  "cay"  "caz"  "cata" "catb"
##  [81] "catc" "catd" "cate" "catf" "catg" "cath" "cati" "catj" "catk" "catl"
##  [91] "catm" "catn" "cato" "catp" "catq" "catr" "cats" "caat" "cbat" "acat"
## [101] "bcat" "ccat" "dcat" "ecat" "fcat" "gcat" "hcat" "icat" "jcat" "kcat"
## [111] "lcat" "mcat" "ncat" "ocat" "pcat" "qcat" "rcat" "scat" "tcat" "ucat"
## [121] "vcat" "wcat" "xcat" "ycat" "zcat" "cdat" "ceat" "cfat" "cgat" "chat"
## [131] "ciat" "cjat" "ckat" "clat" "cmat" "cnat" "coat" "cpat" "cqat" "crat"
## [141] "csat" "ctat" "cuat" "cvat" "cwat" "cxat" "cyat" "czat" "cabt" "cact"
## [151] "cadt" "caet" "caft" "cagt" "caht" "cait" "cajt" "cakt" "calt" "camt"
## [161] "cant" "caot" "capt" "caqt" "cart" "cast" "catt" "caut" "cavt" "cawt"
## [171] "caxt" "cayt" "cazt" "catu" "catv" "catw" "catx" "caty" "catz"

Also works with other edit distances, although it becomes very slow for longer words.

d2 <- distfn2("cat", 2)
set.seed(100)
c(head(d2, 50), sample(d2, 50), tail(d2, 50))
##   [1] "c"     "a"     "t"     "ca"    "ct"    "at"    "aaa"   "baa"  
##   [9] "daa"   "eaa"   "faa"   "gaa"   "haa"   "iaa"   "jaa"   "kaa"  
##  [17] "laa"   "maa"   "naa"   "oaa"   "paa"   "qaa"   "raa"   "saa"  
##  [25] "taa"   "uaa"   "vaa"   "waa"   "xaa"   "yaa"   "zaa"   "cba"  
##  [33] "aca"   "bca"   "cca"   "dca"   "eca"   "fca"   "gca"   "hca"  
##  [41] "ica"   "jca"   "kca"   "lca"   "mca"   "nca"   "oca"   "pca"  
##  [49] "qca"   "rca"   "cnts"  "cian"  "pcatb" "cqo"   "uawt"  "hazt" 
##  [57] "cpxat" "aaet"  "ckata" "caod"  "ncatl" "qcamt" "cdtp"  "qajt" 
##  [65] "bckat" "qcatr" "cqah"  "rcbt"  "cvbt"  "bbcat" "vcaz"  "ylcat"
##  [73] "cahz"  "jcgat" "mant"  "jatd"  "czlat" "cbamt" "cajta" "cafp" 
##  [81] "cizt"  "cmaut" "qwat"  "jcazt" "hdcat" "ucant" "hate"  "cajtl"
##  [89] "caaty" "cix"   "nmat"  "cajit" "cmnat" "caobt" "catoi" "ncau" 
##  [97] "ucoat" "ncamt" "jath"  "oats"  "chatz" "ciatz" "cjatz" "ckatz"
## [105] "clatz" "cmatz" "cnatz" "coatz" "cpatz" "cqatz" "cratz" "csatz"
## [113] "ctatz" "cuatz" "cvatz" "cwatz" "cxatz" "cyatz" "czatz" "cabtz"
## [121] "cactz" "cadtz" "caetz" "caftz" "cagtz" "cahtz" "caitz" "cajtz"
## [129] "caktz" "caltz" "camtz" "cantz" "caotz" "captz" "caqtz" "cartz"
## [137] "castz" "cattz" "cautz" "cavtz" "cawtz" "caxtz" "caytz" "caztz"
## [145] "catuz" "catvz" "catwz" "catxz" "catyz" "catzz"

This could be speeded up by less brute force formation of all permutations and then applying adist() to them - it could consist of changes or insertions of known edit distances generated algorithmically from letters.