Search code examples
pythonpandasmatchfuzzyfuzzywuzzy

How do to fuzzy matching on excel file using Pandas?


I have a table called account with two columns - ID & NAME. ID is a hash which is unique but NAME is a string which might have duplicates.

I'm trying to write a python script to read this excel file and match 0-3 similar NAME values, but I just cannot seem to get it to work. Could someone help out? Thanks

import pandas as pd
from fuzzywuzzy import fuzz
import difflib

def get_spr(row):
    d = name1.apply(lambda x: (fuzz.ratio(x['NAME'], row['NAME']) * 0 if row['ID'] == x['ID'] else 1), axis=1)
    d = d[d>= 60]
    if len(d) == 0:
        v = ['']*2
    else:
        v = name1.ix[d.idxmax(),['ID' , 'NAME']].values
    return pd.Series(v, index=['ID', 'NAME'])


def score(tablerow):
    d = name1.apply(lambda x: fuzz.ratio(x['NAME'],tablerow['NAME']) * (0 if x['ID']==tablerow['ID'] else 1), axis=1)
    d = d[d>90]
    if len(d) == 0:
        v = [''] * 2
    else:
        v = name1.ix[d.order(ascending=False).head(3).index, ['ID' , 'NAME']].values
    return pd.DataFrame(v, index=['ID', 'NAME'])

account = "account_test.xlsx"

xl_acc1 = pd.ExcelFile(account)
xl_acc2 = pd.ExcelFile(account)

acc1 = xl_acc1.parse(xl_acc1.sheet_names[0])
acc2 = xl_acc2.parse(xl_acc2.sheet_names[0])

name1 = acc1[pd.notnull(acc1['NAME'])]
name2 = acc2[pd.notnull(acc2['NAME'])]
print 'Doing Fuzzy Matching'


name2= pd.concat((name2,name2.apply(get_spr, axis=1)), axis=1)
name2.to_excel(pd.ExcelWriter('res.xlsx'),'acc')

Any help would be much appreciated!

The file has rows like this:-

ID                    NAME
0016F00001c7GDZQA2  Daniela Abriani
0016F00001c7GPnQAM  Daniel Abriani
0016F00001c7JRrQAM  Nisha Well
0016F00001c7Jv8QAE  Katherine
0016F00001c7cXiQAI  Katerine
0016F00001c7dA3QAI  Katherin
0016F00001c7kHyQAI  Nursing and Midwifery Council Research Office
0016F00001c8G8OQAU  Nisa Well

Expected (output dataframe) would be something like:

      ID               NAME          ID2          NAME2
    <hash1>          katherine      <hash2>       katerine
    <hash1>          katherine      <hash3>       katherin
    <hash4>          Nisa Well      <hash5>       Nisha Well

Issue: The above code just reproduces the input as the output saved file without actually concatenating any matches.


Solution

  • I don't think you need to do this in pandas. Here is my sloppy solution but it gets your desired output using a dictionary.

    from fuzzywuzzy import process
    df = pd.DataFrame([
                    ['0016F00001c7GDZQA2',  'Daniela Abriani'],
                    ['0016F00001c7GPnQAM',  'Daniel Abriani'],
                    ['0016F00001c7JRrQAM',  'Nisha Well'],
                    ['0016F00001c7Jv8QAE', 'Katherine'],
                    ['0016F00001c7cXiQAI', 'Katerine'],
                    ['0016F00001c7dA3QAI',  'Katherin'],
                    ['0016F00001c7kHyQAI',  'Nursing and Midwifery Council Research Office'],
                    ['0016F00001c8G8OQAU',  'Nisa Well']], 
                    columns=['ID', 'NAME'])
    

    get unique hashes in to a dictionary.

    hashdict = dict(zip(df['ID'], df['NAME']))
    

    define a function checkpair. You'll need it to remove reciprocal hash pairs. This method will add (hash1, hash2) and (hash2, hash1), but I think you only want to keep one of those pairs:

    def checkpair (a,b,l):
        for x in l:
            if (a,b) == (x[2],x[0]):
                l.remove(x)
    

    Now iterate through hashdict.items() finding the top 3 matches along the way. The fuzzywuzzy docs detail the process method.

    matches = []
    for k,v in hashdict.items():
    
        #see docs for extract -- 4 because you are comparing a name to itself
        top3 = process.extract(v, hashdict, limit=4)
    
        #remove the hashID compared to itself
        for h in top3:
            if k == h[2]:
                top3.remove(h)
    
        #append tuples to the list "matches" if it meets a score criteria      
        [matches.append((k, v, x[2], x[0], x[1])) for x in top3 if x[1] > 60] #change score?
    
        #remove reciprocal pairs
        [checkpair(m[0], m[2], matches) for m in matches]
    
    df = pd.DataFrame(matches, columns=['id1', 'name1', 'id2', 'name2', 'score'])
    # write to file
    writer = pd.ExcelWriter('/path/to/your/file.xlsx')
    df.to_excel(writer,'Sheet1')
    writer.save()
    

    Output:

        id1     name1   id2     name2   score
    0   0016F00001c7JRrQAM  Nisha Well  0016F00001c8G8OQAU  Nisa Well   95
    1   0016F00001c7GPnQAM  Daniel Abriani  0016F00001c7GDZQA2  Daniela Abriani     97
    2   0016F00001c7Jv8QAE  Katherine   0016F00001c7dA3QAI  Katherin    94
    3   0016F00001c7Jv8QAE  Katherine   0016F00001c7cXiQAI  Katerine    94
    4   0016F00001c7dA3QAI  Katherin    0016F00001c7cXiQAI  Katerine    88