Search code examples
pythonalgorithmperformancedata-structures

Find words with only one letter difference from a list of words


I'm trying to find all the letters in a list that differ from each other by word. I then output them into an array. The trouble is it takes my current code 17 seconds for 18, 000 words and I trying to cut down the time it takes.

Is there anyway I can speed this up?

#compares words and returns true if they are neighbours
def isneighbour(word1, word2):
    diff = 0
    for i in range(len(word1)):
        if word1[i] != word2[i]:
            diff += 1
            if diff > 1:
                return False
    return diff == 1


def find_neighbours(lst):
    #take a word from lst
    #compare to rest of words from lst
    hood = []
    #convert dictionary to list
    #search for all words of same length
    keylst = list(lst.keys())
    length = len(keylst) -1
    for i in range(0, length -1):
        temp = []
        temp.append(keylst[i])
        for j in range(i + 1, length):
            if len(keylst[i]) == len(keylst[j]):
                #test to see if two words of the same length are neighbours
                if isneighbour(keylst[i], keylst[j]):
                    temp.append(keylst[j])
        if len(temp)>1:
            quicksort(temp)
            hood.append(temp)
    return hood

I've had a google around and can't see how I can improve on this. I am student, so all suggestions will be useful.


Solution

  • Levenshtein Distance

    • It depends on your definition of "differ from each other by [a] letter".

    • It seems to me that the order of letters does matter, which makes the problem more complicated.

    • If these are your assumptions, I'd start with Levenshtein which uses dynamic programming. You can tune the algorithm, based on what the expected output is, or you can use it to narrow down your search space since the Levenshtein distance is pretty fast.

    
    import random, string, collections
    
    
    def levenshtein_edit_distance(A, B):
        dp = [[0] * (len(B) + 1) for _ in range(len(A) + 1)]
        for i in range(len(A) + 1):
            dp[i][0] = i
        for j in range(len(B) + 1):
            dp[0][j] = j
    
        for i in range(1, len(A) + 1):
            for j in range(1, len(B) + 1):
                if A[i - 1] == B[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = 1 + min(dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1])
    
        return dp[-1][-1]
    
    
    def _gen_words(l):
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(l))
    
    
    def _modify(word, diffs=1):
        word = list(word)
        for _ in range(diffs):
            i = random.randint(0, len(word) - 1)
            word[i] = random.choice(string.ascii_lowercase)
        return ''.join(word)
    
    
    words = [_gen_words(3)]
    
    random.seed(10)
    
    for _ in range(100000):
        diffs = random.randint(1, 1)
        w = _modify(words[-1], diffs)
        words.append(w)
    
    unique_words = list(set(words))
    
    memo = collections.defaultdict(list)
    for a, b in zip(unique_words[:], unique_words[1:]):
        dist = levenshtein_edit_distance(a, b)
        if dist == 1:
            memo[a] += [b]
    
    print(memo)
    
    

    Prints

    defaultdict(<class 'list'>, {'sqb': ['sqe'], 'evn': ['lvn'], 'wsr': ['wfr'], 'nnl': ['cnl'], 'joe': ['boe'], 'qol': ['qzl'], 'rbm': ['rbk'], 'tsj': ['tsg'], 'sga': ['sgt'], 'abw': ['arw'], 'yjl': ['fjl'], 'wth': ['wah'], 'zkn': ['ukn'], 'shs': ['xhs'], 'szi': ['czi'], 'eps': ['cps'], 'asb': ['hsb'], 'ish': ['iss'], 'owm': ['rwm'], 'vae': ['vab'], 'dou': ['doz'], 'qeh': ['qch'], 'wod': ['woj'], 'pyg': ['ryg'], 'ctf': ['jtf'], 'hjw': ['hjq'], 'alx': ['clx'], 'ikj': ['ikp'], 'gyr': ['gym'], 'pkj': ['kkj'], 'bcy': ['xcy'], 'ebr': ['evr'], 'gcm': ['gcq'], 'iqw': ['iqx'], 'bml': ['bma'], 'wrz': ['wwz'], 'psk': ['ksk'], 'wtp': ['wzp'], 'inc': ['inn'], 'qbp': ['mbp'], 'bas': ['aas'], 'jfx': ['jyx'], 'osa': ['tsa'], 'shn': ['sun'], 'lhs': ['fhs'], 'fzm': ['fzu'], 'rzc': ['ruc'], 'idi': ['ido'], 'sdw': ['sdo'], 'lye': ['hye'], 'dht': ['cht'], 'gjy': ['gyy'], 'efs': ['efb'], 'rnb': ['rqb'], 'gtw': ['rtw'], 'pgf': ['pgn'], 'yhz': ['ycz'], 'rre': ['rce'], 'box': ['fox'], 'zql': ['zqn'], 'wmc': ['wmx'], 'tny': ['tey'], 'ayy': ['azy'], 'jka': ['jkn'], 'ost': ['ist'], 'ktc': ['kcc'], 'zxl': ['zxo'], 'jfs': ['jvs'], 'jgy': ['rgy'], 'tql': ['tel'], 'yne': ['yje'], 'mpa': ['ypa'], 'uwg': ['ufg'], 'uec': ['uee'], 'ppr': ['rpr'], 'eln': ['egn'], 'opp': ['opk'], 'rip': ['jip'], 'zge': ['fge'], 'jfb': ['jft'], 'jcu': ['jcq'], 'ngc': ['ngq'], 'vrw': ['vcw'], 'uml': ['fml'], 'lez': ['lmz'], 'nhy': ['uhy'], 'nuz': ['nuj'], 'wjj': ['wjg'], 'bqa': ['uqa'], 'kil': ['kix'], 'ymj': ['yxj'], 'bqg': ['bhg'], 'lop': ['lok'], 'fev': ['fei'], 'fjp': ['fje'], 'wzw': ['wiw'], 'ayp': ['anp'], 'xvf': ['yvf'], 'ptw': ['pte'], 'cts': ['ats']})


    From Wiki

    enter image description here