I'm trying to find all the letters in a list that differ from each other by word. I then output them into an array. The trouble is it takes my current code 17 seconds for 18, 000 words and I trying to cut down the time it takes.
Is there anyway I can speed this up?
#compares words and returns true if they are neighbours
def isneighbour(word1, word2):
diff = 0
for i in range(len(word1)):
if word1[i] != word2[i]:
diff += 1
if diff > 1:
return False
return diff == 1
def find_neighbours(lst):
#take a word from lst
#compare to rest of words from lst
hood = []
#convert dictionary to list
#search for all words of same length
keylst = list(lst.keys())
length = len(keylst) -1
for i in range(0, length -1):
temp = []
temp.append(keylst[i])
for j in range(i + 1, length):
if len(keylst[i]) == len(keylst[j]):
#test to see if two words of the same length are neighbours
if isneighbour(keylst[i], keylst[j]):
temp.append(keylst[j])
if len(temp)>1:
quicksort(temp)
hood.append(temp)
return hood
I've had a google around and can't see how I can improve on this. I am student, so all suggestions will be useful.
It depends on your definition of "differ from each other by [a] letter".
It seems to me that the order of letters does matter, which makes the problem more complicated.
If these are your assumptions, I'd start with Levenshtein which uses dynamic programming. You can tune the algorithm, based on what the expected output is, or you can use it to narrow down your search space since the Levenshtein distance is pretty fast.
import random, string, collections
def levenshtein_edit_distance(A, B):
dp = [[0] * (len(B) + 1) for _ in range(len(A) + 1)]
for i in range(len(A) + 1):
dp[i][0] = i
for j in range(len(B) + 1):
dp[0][j] = j
for i in range(1, len(A) + 1):
for j in range(1, len(B) + 1):
if A[i - 1] == B[j - 1]:
dp[i][j] = dp[i - 1][j - 1]
else:
dp[i][j] = 1 + min(dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1])
return dp[-1][-1]
def _gen_words(l):
return ''.join(random.choice(string.ascii_lowercase) for _ in range(l))
def _modify(word, diffs=1):
word = list(word)
for _ in range(diffs):
i = random.randint(0, len(word) - 1)
word[i] = random.choice(string.ascii_lowercase)
return ''.join(word)
words = [_gen_words(3)]
random.seed(10)
for _ in range(100000):
diffs = random.randint(1, 1)
w = _modify(words[-1], diffs)
words.append(w)
unique_words = list(set(words))
memo = collections.defaultdict(list)
for a, b in zip(unique_words[:], unique_words[1:]):
dist = levenshtein_edit_distance(a, b)
if dist == 1:
memo[a] += [b]
print(memo)
defaultdict(<class 'list'>, {'sqb': ['sqe'], 'evn': ['lvn'], 'wsr': ['wfr'], 'nnl': ['cnl'], 'joe': ['boe'], 'qol': ['qzl'], 'rbm': ['rbk'], 'tsj': ['tsg'], 'sga': ['sgt'], 'abw': ['arw'], 'yjl': ['fjl'], 'wth': ['wah'], 'zkn': ['ukn'], 'shs': ['xhs'], 'szi': ['czi'], 'eps': ['cps'], 'asb': ['hsb'], 'ish': ['iss'], 'owm': ['rwm'], 'vae': ['vab'], 'dou': ['doz'], 'qeh': ['qch'], 'wod': ['woj'], 'pyg': ['ryg'], 'ctf': ['jtf'], 'hjw': ['hjq'], 'alx': ['clx'], 'ikj': ['ikp'], 'gyr': ['gym'], 'pkj': ['kkj'], 'bcy': ['xcy'], 'ebr': ['evr'], 'gcm': ['gcq'], 'iqw': ['iqx'], 'bml': ['bma'], 'wrz': ['wwz'], 'psk': ['ksk'], 'wtp': ['wzp'], 'inc': ['inn'], 'qbp': ['mbp'], 'bas': ['aas'], 'jfx': ['jyx'], 'osa': ['tsa'], 'shn': ['sun'], 'lhs': ['fhs'], 'fzm': ['fzu'], 'rzc': ['ruc'], 'idi': ['ido'], 'sdw': ['sdo'], 'lye': ['hye'], 'dht': ['cht'], 'gjy': ['gyy'], 'efs': ['efb'], 'rnb': ['rqb'], 'gtw': ['rtw'], 'pgf': ['pgn'], 'yhz': ['ycz'], 'rre': ['rce'], 'box': ['fox'], 'zql': ['zqn'], 'wmc': ['wmx'], 'tny': ['tey'], 'ayy': ['azy'], 'jka': ['jkn'], 'ost': ['ist'], 'ktc': ['kcc'], 'zxl': ['zxo'], 'jfs': ['jvs'], 'jgy': ['rgy'], 'tql': ['tel'], 'yne': ['yje'], 'mpa': ['ypa'], 'uwg': ['ufg'], 'uec': ['uee'], 'ppr': ['rpr'], 'eln': ['egn'], 'opp': ['opk'], 'rip': ['jip'], 'zge': ['fge'], 'jfb': ['jft'], 'jcu': ['jcq'], 'ngc': ['ngq'], 'vrw': ['vcw'], 'uml': ['fml'], 'lez': ['lmz'], 'nhy': ['uhy'], 'nuz': ['nuj'], 'wjj': ['wjg'], 'bqa': ['uqa'], 'kil': ['kix'], 'ymj': ['yxj'], 'bqg': ['bhg'], 'lop': ['lok'], 'fev': ['fei'], 'fjp': ['fje'], 'wzw': ['wiw'], 'ayp': ['anp'], 'xvf': ['yvf'], 'ptw': ['pte'], 'cts': ['ats']})