I am building a program for comparing each promocode(might contain ocr error) in a list to all the promocode in another list(list of correct promocodes)
the expected output is edit distance and the promo code with least edit distance to the one which is getting compared.
my code
import csv
from nltk.metrics import distance
with open("all_correct_promo.csv","rb") as file1:
reader1 = csv.reader(file1)
correctPromoList = list(reader1)
#print correctPromoList
with open("all_extracted_promo.csv","rb") as file2:
reader2 = csv.reader(file2)
extractedPromoList = list(reader2)
#print extractedPromoList
def find_min_edit(str_,list_):
nearest_correct_promos = []
distances = {}
min_dist = 100 # arbitrary large assignment
for correct_promo in list_:
dist = distance.edit_distance(extracted,correct_promo,True) # compute Levenshtein distance
distances[correct_promo] = dist # store each score for real promo codes
if dist<min_dist:
min_dist = dist # store min distance
# extract all real promo codes with minimum Levenshtein distance
nearest_correct_promos.append(','.join([i[0] for i in distances.items() if i[1]==min_dist]))
return ','.join(nearest_correct_promos) # return a comma separated string of nearest real promo codes
incorrectPromo = {}
count = 0
for extracted in extractedPromoList:
print 'Computing %dth promo code...' % count
incorrectPromo[extracted] = find_min_edit(extracted,correctPromoList) # get comma separated str of real promo codes nearest to extracted
count+=1
print incorrectPromo
Expected output
Computing 0th promo code...
Computing 1th promo code...
Computing 2th promo code...
{'abc': 'abc', 'abd': 'abx,aba,abz,abc', 'acd': 'abx,aba,abz,abc'}
BUT, my code is showing the following errors
Computing 0th promo code...
Traceback (most recent call last):
File "correctpromo_test4.py", line 31, in <module>
incorrectPromo[extracted] = find_min_edit(extracted,correctPromoList) # get
comma separated str of real promo codes nearest to extracted
File "correctpromo_test4.py", line 20, in find_min_edit
distances[correct_promo] = dist # store each score for real promo codes
TypeError: unhashable type: 'list'
You are reading the CSV as a list of lists - the function find_min_edit() is expecting a list of strings as its second argument; what you are passing is a list of lists of strings.
Changing the way you read the csv files with sort this stuff out -
Instead of
with open("all_correct_promo.csv","rb") as file1:
reader1 = csv.reader(file1)
correctPromoList = list(reader1)
Just use this
with open("all_correct_promo.csv","rb") as file1:
reader1 = csv.reader(file1)
correctPromoList = [''.join(i) for i in reader1]
print correctPromoList
Do this for both the CSVs, that will sort it out...