Search code examples
python-3.xpandasapplymultiple-arguments

Apply function to pandas series given varying arguments


Initial question

I want to calculate the Levenshtein distance between every pair of strings, where one string comes from a pandas Series and the other from a list. I tried my hand at map, zip, etc., but I only got the desired result using a for loop together with apply. Is there a way to improve the style and, especially, the speed?

Here is what I tried. It does what it is supposed to do, but it is slow for a large series.

import pandas as pd
import stringdist

strings = ['Hello', 'my', 'Friend', 'I', 'am']
s = pd.Series(data=strings, index=strings)
c = ['me', 'mine', 'Friend']

# Baseline approach: build the table one column at a time. Each pass calls
# s.apply with a Python lambda, so the distance function is invoked once per
# (series value, word) pair.
df = pd.DataFrame()
for w in c:
    # The lambda is consumed immediately by apply, so capturing the loop
    # variable w here is safe.
    df[w] = s.apply(lambda x: stringdist.levenshtein(x, w))

## Result: ##
        me  mine  Friend
Hello    4     5       6
my       1     3       6
Friend   5     4       0
I        2     4       6
am       2     4       6

Solution

Thanks to @Dames and @molybdenum42, I can provide the solution I used, directly beneath the question. For more insights, please check their great answers below.

from itertools import product

import numpy as np
import pandas as pd
import stringdist

strings = ['Hello', 'my', 'Friend', 'I', 'am']
s = pd.Series(data=strings, index=strings)
c = ['me', 'mine', 'Friend']

# All (series value, word) pairs as an (len(s) * len(c), 2) array. product()
# varies its last argument fastest, so the pairs are grouped by series value.
word_combinations = np.array(list(product(s.values, c)))
vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
result = vectorized_levenshtein(word_combinations[:, 0],
                                word_combinations[:, 1])
# The flat result follows the pair order above, so this reshape puts series
# values on the rows and the words of c on the columns.
result = result.reshape((len(s), len(c)))
# Label rows with the index of s (not its values) so the frame matches what
# the apply-based loop produces.
df = pd.DataFrame(result, columns=c, index=s.index)

This results in the desired data frame.


Solution

  • Setup:

    import stringdist
    import pandas as pd
    import numpy as np
    import itertools
    
    # Words to compare against; they become the columns of the result.
    c = ['me', 'mine', 'Friend']
    # Sample series whose index labels are the same strings as its values.
    s = pd.Series(
        ['Hello', 'my', 'Friend'],
        index=['Hello', 'my', 'Friend'],
    )
    

    Options

    1. option: an easy one-liner
    # Build the frame from a dict so each word in c becomes a named column
    # and the rows keep the index of s. Passing a plain list of Series would
    # instead stack them as unlabeled rows, i.e. the transposed table.
    df = pd.DataFrame({w: s.apply(lambda x: stringdist.levenshtein(x, w)) for w in c})
    
    2. option: np.fromfunction (thanks to @baccandr)
    @np.vectorize
    def lavdist(a, b):
        """Return the distance between the a-th value of s and the b-th word of c."""
        # .iloc makes the positional lookup explicit; s is indexed by strings,
        # so s[a] with an integer would rely on the deprecated positional
        # fallback of Series.__getitem__.
        return stringdist.levenshtein(s.iloc[a], c[b])

    # np.fromfunction passes the row and column positions of a
    # (len(s), len(c)) grid, so rows correspond to s and columns to c,
    # matching the labels given to the DataFrame below.
    df = pd.DataFrame(np.fromfunction(lavdist, (len(s), len(c)), dtype=int),
                      columns=c, index=s.index)
    
    3. option: see @molybdenum42
    # All (series value, word) pairs as a two-column array.
    word_combinations = np.array(list(itertools.product(s.values, c)))
    vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
    result = vectorized_levenshtein(word_combinations[:, 0], word_combinations[:, 1])
    # Long format: column 0 = series value, column 1 = word, column 2 = distance.
    # A dict is used so each array becomes a column (a list of arrays would
    # become rows), and column 0 must come from word_combinations[:, 0].
    df = pd.DataFrame({0: word_combinations[:, 0],
                       1: word_combinations[:, 1],
                       2: result})
    # Pivot to wide format. unstack needs unique (0, 1) pairs and returns the
    # row and column labels in sorted order rather than input order.
    df = df.set_index([0, 1])[2].unstack()
    
    4. (the best) option: modified option 3
    # All (series value, word) pairs; itertools.product varies the word
    # fastest, so the pairs are grouped by series value.
    word_combinations = np.array(list(itertools.product(s.values, c)))
    # Use the distance function imported in the setup above, as the other
    # options do.
    vectorized_levenshtein = np.vectorize(stringdist.levenshtein)
    result = vectorized_levenshtein(word_combinations[:, 0], word_combinations[:, 1])
    # The flat result follows the pair order, so this reshape yields one row
    # per series value and one column per word, with no pivoting needed.
    result = result.reshape((len(s), len(c)))
    df = pd.DataFrame(result, columns=c, index=s.index)
    

    Performance testing:

    import timeit
    from Levenshtein import distance
    import pandas as pd
    import numpy as np
    import itertools
    
    # Benchmark inputs: the comparison words and a small series whose index
    # labels equal its values.
    c = ['me', 'mine', 'Friend']
    s = pd.Series(
        ['Hello', 'my', 'Friend'],
        index=['Hello', 'my', 'Friend'],
    )
    
    # Snippets handed to timeit as source strings. They reference distance,
    # s, c, pd, np and itertools, which the timeit setup must provide.

    # Baseline: one apply call per word, assigned column by column.
    test_code0 = """
    df = pd.DataFrame()
    for w in c:
        df[w] = s.apply(lambda x: distance(x, w))
    """
    
    # Dict comprehension: one apply call per word, single DataFrame build.
    test_code1 = """
    df = pd.DataFrame({w:s.apply(lambda x: distance(x, w)) for w in c})
    """
    
    # np.fromfunction over a (len(s), len(c)) grid so rows follow s and
    # columns follow c; s.iloc makes the positional lookup explicit.
    test_code2 = """
    @np.vectorize
    def lavdist(a, b):
        return distance(s.iloc[a], c[b])
    
    df = pd.DataFrame(np.fromfunction(lavdist, (len(s), len(c)), dtype = int), 
                      columns=c, index=s)
    """
    
    # Long-format frame pivoted with unstack. Columns 0 and 1 hold the two
    # members of each pair; built from a dict so the arrays become columns.
    test_code3 = """
    word_combinations = np.array(list(itertools.product(s.values, c)))
    vectorized_levenshtein = np.vectorize(distance)
    result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
    df = pd.DataFrame({0: word_combinations[:,0], 1: word_combinations[:,1], 2: result})
    df = df.set_index([0,1])[2].unstack()
    """
    
    # Same pair array as above, reshaped directly into the final table.
    test_code4 = """
    word_combinations = np.array(list(itertools.product(s.values, c)))
    vectorized_levenshtein = np.vectorize(distance)
    result = vectorized_levenshtein(word_combinations[:,0], word_combinations[:,1])
    result = result.reshape((len(s), len(c)))
    df = pd.DataFrame(result, columns=c, index=s)
    """
    
    # Names every snippet needs, pulled from the running script's namespace.
    test_setup = "from __main__ import distance, s, c, pd, np, itertools"

    # Run each snippet 1000 times and print its label with the total time.
    for label, snippet in (
        ("test0", test_code0),
        ("test1", test_code1),
        ("test2", test_code2),
        ("test3", test_code3),
        ("test4", test_code4),
    ):
        print(label, timeit.timeit(snippet, number=1000, setup=test_setup))
    

    Results

    # results
    # test0 1.3671939949999796
    # test1 0.5982696900009614
    # test2 0.3246431229999871
    # test3 2.0100400850005826
    # test4 0.23796007100099814