Search code examples
pythonlistreturn-valuearcgisfuzzywuzzy

fuzzywuzzy returning single characters, not strings


I'm not sure where I'm going wrong here and why my data is returning wrong. Writing this code to use fuzzywuzzy to clean bad input road names against a list of correct names, replacing the incorrect with the closest match.

It's returning all lines of data2 back. I'm looking for it to return the same, or replaced lines of data1 back to me.

My Minimal, Reproducible Example:

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')

try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    pass

roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')

street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data2 = data2.split(',')

if street2 not in data2:
    street2 = process.extract(street2, data2)
    print(street2[0])

My full code

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def convert_tolist(string):
    li = list(string.split(" "))
    return li

with open(r"Cass_Howard - Copy.csv") as csv_file,\
        open("Final_Test_Clean.csv", "w") as f,\
        open(r"TEST_no_dups12.csv") as ul:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    csv_reader = csv.reader(f, delimiter=',')
    file_1 = csv_file
    file_2 = ul

    for data1, data2 in zip(file_1, file_2):
        data1 = data1.split(',')
        data1 = data1[18]
        data1 = data1.upper()
        data2 = data2.strip()
        data2 = data2.split(',')
        data2 = ''.join(data2)
        try:
            data1 = usaddress.tag(data1)
        except usaddress.RepeatedLabelError:
            pass

        roaddnum2 = data1[0].get('AddressNumber', '')
        roadir2 = data1[0].get('StreetNamePreDirectional', '')
        roadname2 = data1[0].get('StreetName', '')
        roaddsg2 = data1[0].get('StreetNamePostType', '')

        street2 = (roadir2, roadname2, roaddsg2)
        street2 = " ".join(street2)
        street2 = street2.strip()
        data1 = list(data1)
        convert_tolist(data2)


        if street2 not in data2:
            street2 = process.extract(street2, data2)
            print(street2)

street2 query data(around 950 lines)

DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST

data2 choices data(around 200 lines)

ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN

Solution

  • My issue here was fuzzywuzzy requires you to pass an iterable so I had to add data2 = data2.split(',') to get the full strings to return.

    My other issue was thinking I needed to use zip() to compare my files when zip() is for comparing parallel and not cross products.

    Here is what I came up with that works. There is no issue with the code but fuzzywuzzy is not accurate enough for this tool to be practical to clean my data with the amount of typos in my address data.

    If you can think of a way for me to clean up the if/else statement i'd be willing to hear it.

    import os
    import csv
    import shutil
    import usaddress
    import pandas as pd
    from fuzzywuzzy import process
    
    with open(r"TEST_Cass_Howard.csv") as csv_file, \
            open(".\Scratch\Final_Test_Clean.csv", "w") as f, \
            open(r"TEST_Unique_List.csv") as ul:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)
        csv_reader1 = csv.reader(f, delimiter=',')
        correct = list(ul)
        for line in csv_reader:
            line = line[18]
            line = line.upper()
            if line == '' or line == ' ':
                line = ''
            else:
                try:
                    addressbrk = usaddress.tag(line)
                except usaddress.RepeatedLabelError:
                    addressbrk = line
                line = addressbrk  # alt output: ('Please fix the incorect format of: %s,' % addressbrk)
            if line != '':
                roadnum2 = line[0].get('AddressNumber', '')
                roadir2 = line[0].get('StreetNamePreDirectional', '')
                roadname2 = line[0].get('StreetName', '')
                roaddsg2 = line[0].get('StreetNamePostType', '')
            else:
                line = ''
            if line != '':
                street2 = (roadir2, roadname2, roaddsg2)
                street2 = " ".join(street2)
                street2 = street2.strip()
            else:
                street2 = ''
            if street2 != '':
                if street2 not in correct:
                    street2 = process.extractOne(street2, correct)
                else:
                    street2 = '\n'
            if street2 != '':
                tgthr = (roadnum2, street2[0])
            else:
                tgthr = ''
            if tgthr != '':
                final = (' '.join(tgthr))
            else:
                final = ('Null' + '\n')
            f.writelines(final)
    original = r"TEST_Cass_Howard.csv" 
    target = (r'.\Scratch\Cass_Howard_Clean.csv')
    shutil.copyfile(original, target)
    df1 = pd.read_csv('.\Scratch\Final_Test_Clean.csv', header=None)
    df1.columns = ["WELL_ADDR_CLN"]
    df = pd.read_csv('.\Scratch\Cass_Howard_Clean.csv')
    df = df.join(df1)
    new_data = df['WELL_ADDR_CLN']
    df = df.drop(columns=['WELL_ADDR_CLN'])
    df.insert(loc=19, column='WELL_ADDR_CLN', value=new_data)
    os.remove(".\Scratch\Cass_Howard_Clean.csv")
    df.to_csv("Cass_Howard_Clean.csv", index=False)
    os.remove('.\Scratch\Final_Test_Clean.csv')