Search code examples
pythonoptimizationsimilaritypearson

Pearson Similarity Score, how can I optimise this further?


I have an implemented of Pearson's Similarity score for comparing two dictionaries of values. More time is spent in this method than anywhere else (potentially many millions of calls), so this is clearly the critical method to optimise.

Even the slightest optimisation could have a big impact on my code, so I'm keen to explore even the smallest improvements.

Here's what I have so far:

def simple_pearson(v1,v2):

    si = [val for val in v1 if val in v2]

    n = len(si)

    if n==0: return 0.0

    sum1 = 0.0
    sum2 = 0.0
    sum1_sq = 0.0
    sum2_sq = 0.0
    p_sum = 0.0

    for v in si:
        val_1 = v1[v]
        val_2 = v2[v]
        sum1+=val_1
        sum2+=val_2
        sum1_sq+=pow(val_1,2)
        sum2_sq+=pow(val_2,2)
        p_sum+=val_1*val_2

    # Calculate Pearson score
    num = p_sum-(sum1*sum2/n)
    temp = (sum1_sq-pow(sum1,2)/n) * (sum2_sq-pow(sum2,2)/n)
    if temp < 0.0:
        temp = -temp
    den = sqrt(temp)
    if den==0: return 1.0

    r = num/den

    return r

Solution

  • Scipy is the fastest!

    I have don some tests with the code above and also with a version I found on my comp, see below for results and the code:

    pearson 14.7597990757
    sim_pearson 15.6806837987
    scipy:pearsonr 0.451986019188
    
    
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass
    
    from math import sqrt
    
    def sim_pearson(set1, set2):
        si={}
        for item in set1:
            if item in set2:
                si[item] = 1
    
        #number of elements
        n = len(si)
    
        #if none common, return 0 similarity
        if n == 0: return 0
    
        #add up all the preferences
        sum1 = sum([set1[item] for item in si])
        sum2 = sum([set2[item] for item in si])
    
        #sum up the squares
        sum_sq1 = sum([pow(set1[item], 2) for item in si])
        sum_sq2 = sum([pow(set2[item], 2) for item in si])
    
        #sum up the products
        sum_p = sum([set1[item] * set2[item] for item in si])
    
        nom = sum_p - ((sum1 * sum2) / n )
        den = sqrt( (sum_sq1 - (sum1)**2 / n) * (sum_sq2 - (sum2)**2 / n) )
    
        if den==0: return 0
        return nom/den
    
    
    
    # from http://stackoverflow.com/questions/1307016/pearson-similarity-score-how-can-i-optimise-this-further
    def pearson(v1, v2):
        vs = [(v1[val],v2[val]) for val in v1 if val in v2]
    
        n = len(vs)
    
        if n==0: return 0.0
    
        sum1,sum2,sum1_sq,sum2_sq,p_sum = 0.0, 0.0, 0.0, 0.0, 0.0
    
        for v1,v2 in vs:
            sum1+=v1
            sum2+=v2
            sum1_sq+=v1*v1
            sum2_sq+=v2*v2
            p_sum+=v1*v2
    
        # Calculate Pearson score
        num = p_sum-(sum1*sum2/n)
        temp = max((sum1_sq-pow(sum1,2)/n) * (sum2_sq-pow(sum2,2)/n),0)
        if temp:
            return num / sqrt(temp)
        return 1.0
    
    
    
    
    
    
    if __name__ == "__main__":
        import timeit
    
        tsetup = """
    from random import randrange
    from __main__ import pearson, sim_pearson
    from scipy.stats import pearsonr
    v1 = [randrange(0,1000) for x in range(1000)]
    v2 = [randrange(0,1000) for x in range(1000)]
    #gc.enable()
    """
        t1 = timeit.Timer(stmt="pearson(v1,v2)", setup=tsetup)
        t2 = timeit.Timer(stmt="sim_pearson(v1,v2)", setup=tsetup)
        t3 = timeit.Timer(stmt="pearsonr(v1,v2)", setup=tsetup)
    
        tt = 1000
    
        print 'pearson', t1.timeit(tt)
        print 'sim_pearson', t2.timeit(tt)
        print 'scipy:pearsonr', t3.timeit(tt)