I'm not sure where I'm going wrong here and why my data is returning wrong. Writing this code to use fuzzywuzzy to clean bad input road names against a list of correct names, replacing the incorrect with the closest match.
It's returning all lines of data2
back. I'm looking for it to return the same, or replaced lines of data1
back to me.
My Minimal, Reproducible Example:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2[0])
My full code
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
li = list(string.split(" "))
return li
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
for data1, data2 in zip(file_1, file_2):
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
convert_tolist(data2)
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2)
street2
query data(around 950 lines)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2
choices data(around 200 lines)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
My issue here was fuzzywuzzy requires you to pass an iterable so I had to add data2 = data2.split(',')
to get the full strings to return.
My other issue was thinking I needed to use zip()
to compare my files when zip()
is for comparing parallel and not cross products.
Here is what I came up with that works. There is no issue with the code but fuzzywuzzy is not accurate enough for this tool to be practical to clean my data with the amount of typos in my address data.
If you can think of a way for me to clean up the if/else statement i'd be willing to hear it.
import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process
with open(r"TEST_Cass_Howard.csv") as csv_file, \
open(".\Scratch\Final_Test_Clean.csv", "w") as f, \
open(r"TEST_Unique_List.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader1 = csv.reader(f, delimiter=',')
correct = list(ul)
for line in csv_reader:
line = line[18]
line = line.upper()
if line == '' or line == ' ':
line = ''
else:
try:
addressbrk = usaddress.tag(line)
except usaddress.RepeatedLabelError:
addressbrk = line
line = addressbrk # alt output: ('Please fix the incorect format of: %s,' % addressbrk)
if line != '':
roadnum2 = line[0].get('AddressNumber', '')
roadir2 = line[0].get('StreetNamePreDirectional', '')
roadname2 = line[0].get('StreetName', '')
roaddsg2 = line[0].get('StreetNamePostType', '')
else:
line = ''
if line != '':
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
else:
street2 = ''
if street2 != '':
if street2 not in correct:
street2 = process.extractOne(street2, correct)
else:
street2 = '\n'
if street2 != '':
tgthr = (roadnum2, street2[0])
else:
tgthr = ''
if tgthr != '':
final = (' '.join(tgthr))
else:
final = ('Null' + '\n')
f.writelines(final)
original = r"TEST_Cass_Howard.csv"
target = (r'.\Scratch\Cass_Howard_Clean.csv')
shutil.copyfile(original, target)
df1 = pd.read_csv('.\Scratch\Final_Test_Clean.csv', header=None)
df1.columns = ["WELL_ADDR_CLN"]
df = pd.read_csv('.\Scratch\Cass_Howard_Clean.csv')
df = df.join(df1)
new_data = df['WELL_ADDR_CLN']
df = df.drop(columns=['WELL_ADDR_CLN'])
df.insert(loc=19, column='WELL_ADDR_CLN', value=new_data)
os.remove(".\Scratch\Cass_Howard_Clean.csv")
df.to_csv("Cass_Howard_Clean.csv", index=False)
os.remove('.\Scratch\Final_Test_Clean.csv')