Search code examples
Tags: swift, string, nlp, spell-checking, uitextchecker

How to get context of a sentence


So I'm working on an app that can patch words that are broken.

Lets take:

mny people say there is a error in this sentence

With Swift we can use UITextChecker and get a wonderful result of what the word mny could actually be... However, I actually get a couple of choices, one of which is many, and among the others there is money, which obviously wouldn't fit very well in this sentence. Is there any way to check whether the sentence itself is logical?


Solution

  • Note that this still needs to be improved. I updated this Swift 3 solution to Swift 5. It is worth mentioning that it was originally inspired by this Python tutorial.

    Create a new iOS project and add a text file named bigtext.txt containing this text. This will be our "learning" dictionary. Then, in ViewController:

    import UIKit
    import NaturalLanguage
    
    /// Demo view controller: trains a `SpellChecker` from the bundled
    /// `bigtext.txt`, runs every word of a sample sentence through it and
    /// prints the per-word corrections followed by the corrected sentence.
    class ViewController: UIViewController {

        override func viewDidLoad() {
            super.viewDidLoad()

            let inputString = "mny people say there is a error in this sentence"

            // The corrected sentence is rebuilt segment by segment instead of
            // patching a copy of `inputString` in place: token ranges are indices
            // into `inputString`, and they stop being valid for a mutated copy as
            // soon as one replacement has a different length than the original word.
            var newString = ""
            var cursor = inputString.startIndex

            // Read a text file and "study" the model
            guard let path = Bundle.main.path(forResource: "bigtext", ofType: "txt") else {
                print("Path not available")
                return
            }
            // `nil` when the file cannot be read; every word is then left as is.
            let checker = SpellChecker(contentsOfFile: path)

            // better to use this to iterate between words in a sentence
            let tokenizer = NLTokenizer(unit: .word)
            tokenizer.string = inputString
            tokenizer.enumerateTokens(in: inputString.startIndex..<inputString.endIndex) { tokenRange, _ in
                let word = String(inputString[tokenRange])
                let checked = checker?.correct(word: word)
                let candidates = checker?.candidates(word: word)

                // Copy the text between the previous token and this one verbatim
                // (whitespace, punctuation), then the possibly corrected word.
                newString.append(contentsOf: inputString[cursor..<tokenRange.lowerBound])
                newString.append(checked ?? word)
                cursor = tokenRange.upperBound

                if word == checked {
                    print("\(word) unchanged")
                } else {
                    print("Correct:\t\(word) -> \(String(describing: checked))")
                    print("Candidates:\t\(word) -> \(String(describing: candidates))")
                }
                return true
            }

            // Whatever follows the last token (trailing punctuation, spaces).
            newString.append(contentsOf: inputString[cursor..<inputString.endIndex])
            print("Result: \(newString)")
        }
    }
    
    /// Returns every string that is one simple edit away from `word`.
    ///
    /// The edits considered are: deleting one character, swapping two adjacent
    /// characters, replacing one character with a letter `a`–`z`, and inserting
    /// a letter `a`–`z` at any position, including after the last character.
    /// Replacing a character with itself is one of the generated edits, so the
    /// result also contains `word`.
    ///
    /// - Parameter word: The word to generate edits for.
    /// - Returns: The set of edited strings; empty when `word` is empty.
    func edits(word: String) -> Set<String> {
        if word.isEmpty { return [] }

        // One (left, right) pair per character position; `right` always starts
        // with the character at that position and is therefore never empty.
        let splits = word.indices.map {
            (word[word.startIndex..<$0], word[$0..<word.endIndex])
        }

        // Drop the first character of `right`: one deletion per position.
        let deletes: [String] = splits.map { left, right in
            "\(left)\(right.dropFirst())"
        }

        // Swap the first two characters of `right` where it has at least two.
        let transposes: [String] = splits.compactMap { left, right -> String? in
            guard let first = right.first, let second = right.dropFirst().first else {
                return nil
            }
            return "\(left)\(second)\(first)\(right.dropFirst(2))"
        }

        let alphabet = "abcdefghijklmnopqrstuvwxyz"

        let replaces: [String] = splits.flatMap { left, right in
            alphabet.map { "\(left)\($0)\(right.dropFirst())" }
        }

        // Insert before every character ...
        let innerInserts: [String] = splits.flatMap { left, right in
            alphabet.map { "\(left)\($0)\(right)" }
        }
        // ... and after the last one, which `splits` has no entry for.
        let trailingInserts: [String] = alphabet.map { "\(word)\($0)" }

        var result = Set(deletes)
        result.formUnion(transposes)
        result.formUnion(replaces)
        result.formUnion(innerInserts)
        result.formUnion(trailingInserts)
        return result
    }
    
    /// A word-frequency spelling corrector trained from a plain-text file.
    ///
    /// Training counts how often each lowercase `a`–`z` word occurs in the
    /// text. Correction proposes known words that are zero, one or two edits
    /// (see `edits(word:)`) away from the input and picks the most frequent one.
    struct SpellChecker {

        /// Number of occurrences of each word seen during training.
        var knownWords: [String:Int] = [:]

        /// Records one more occurrence of `word`.
        mutating func train(word: String) {
            knownWords[word, default: 0] += 1
        }

        /// Trains a checker from the UTF-8 text file at `file`.
        ///
        /// The text is lowercased and split on every scalar outside `a`–`z`,
        /// so digits, punctuation and non-ASCII letters act as separators.
        /// Returns `nil` when the file cannot be read.
        init?(contentsOfFile file: String) {
            do {
                let text = try String(contentsOfFile: file, encoding: .utf8).lowercased()
                let words = text.unicodeScalars.split(whereSeparator: { !("a"..."z").contains($0) })
                for word in words { train(word: String(word)) }
            }
            catch {
                return nil
            }
        }

        /// Known words that are two edits away from `word`.
        ///
        /// - Returns: The matching words, or `nil` when there are none.
        func knownEdits2(word: String) -> Set<String>? {
            var found: Set<String> = []
            for edit in edits(word: word) {
                if let matches = known(words: edits(word: edit)) {
                    found.formUnion(matches)
                }
            }
            return found.isEmpty ? nil : found
        }

        /// The subset of `words` that occurred in the training text.
        ///
        /// - Returns: The matching words, or `nil` when there are none, which
        ///   lets callers chain fallbacks with `??`.
        func known<S: Sequence>(words: S) -> Set<String>? where S.Iterator.Element == String {
            let found = Set(words.filter { knownWords[$0] != nil })
            return found.isEmpty ? nil : found
        }

        /// Possible corrections for `word`, nearest match first: the word
        /// itself when known, otherwise known words one edit away, otherwise
        /// known words two edits away.
        ///
        /// - Returns: The candidates, or an empty set when nothing matches.
        func candidates(word: String) -> Set<String> {
            return known(words: [word])
                ?? known(words: edits(word: word))
                ?? knownEdits2(word: word)
                ?? []
        }

        /// The most frequent candidate for `word`, or `word` itself when there
        /// is no candidate.
        ///
        /// An unknown input word never competes with the candidates, so a
        /// known word that was seen only once still replaces a misspelling.
        /// Candidates with equal counts are ordered alphabetically (the
        /// smaller one wins) so the result does not depend on `Set` iteration
        /// order.
        func correct(word: String) -> String {
            let best = candidates(word: word).max { lhs, rhs in
                let lhsCount = knownWords[lhs] ?? 0
                let rhsCount = knownWords[rhs] ?? 0
                if lhsCount != rhsCount { return lhsCount < rhsCount }
                return lhs > rhs
            }
            return best ?? word
        }
    }
    

    This prints output like the following (the exact candidates depend on the contents of bigtext.txt):

    Correct:    mny -> Optional("may")
    Candidates: mny -> Optional(Set(["any", "ny", "may", "many"]))
    people unchanged
    say unchanged
    there unchanged
    is unchanged
    a unchanged
    error unchanged
    in unchanged
    this unchanged
    sentence unchanged
    Result: may people say there is a error in this sentence
    

    Please note that this picks a single correction candidate based on word frequency alone. To choose between candidates properly, we would first need to take the word order into account and understand the context of the sentence.