Tags: python, algorithm, numpy, time-complexity, entropy

How to improve the algorithm efficiency of the entropy weight method in Python


Below is the code; however, it's very slow when dealing with large data (it may take more than a day for a 5,000,000-row, 6-column dataframe).

Just wondering how I could optimise it? Many thanks.

import math
import numpy as np
import pandas as pd

def ewm(df):
    # min-max normalise each column
    df = df.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x))))
    rows, cols = df.shape
    k = 1.0 / math.log(rows)

    # element-wise entropy terms, computed one cell at a time (slow)
    lnf = [[None] * cols for i in range(rows)]
    for i in range(0, rows):
        for j in range(0, cols):
            if df.iloc[i][j] == 0:
                lnfij = 0.0
            else:
                p = df.iloc[i][j] / df.iloc[:, j].sum()
                lnfij = math.log(p) * p * (-k)
            lnf[i][j] = lnfij
    lnf = pd.DataFrame(lnf)

    # degree of divergence and final weights
    d = 1 - lnf.sum(axis=0)
    w = [[None] * 1 for i in range(cols)]
    for j in range(0, cols):
        wj = d[j] / sum(d)
        w[j] = wj

    w = pd.DataFrame(w)
    w = w.round(5)    # .applymap(lambda x: format(x, '.5f'))
    w.index = df.columns
    w.columns = ['weight']
    return w
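
For reference, the quantities the code is meant to compute for each column $j$ are the standard entropy-weight formulas (with $n$ rows, $x_{ij}$ the min-max normalised value, and $k = 1/\ln n$):

$$
p_{ij} = \frac{x_{ij}}{\sum_{i=1}^{n} x_{ij}}, \qquad
e_j = -k \sum_{i=1}^{n} p_{ij} \ln p_{ij}, \qquad
d_j = 1 - e_j, \qquad
w_j = \frac{d_j}{\sum_{j} d_j},
$$

with the convention that a zero entry contributes 0 to the entropy sum.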

Solution

  • Having numpy do the loops should speed it up a lot; the zero entries are masked so that log(0) is never evaluated:

    import math
    import numpy as np
    import pandas as pd

    def ewm(df):
        # min-max normalise each column
        df = df.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x))))
        rows, cols = df.shape
        k = 1.0 / math.log(rows)

        # column-wise proportions
        p = df / df.sum(axis=0)
        # entropy terms; zeros are replaced by 1 before the log so that
        # 0 * log(0) is treated as 0, matching the loop version
        lnf = -np.log(p.mask(p == 0, 1)) * p * k

        # degree of divergence and final weights
        d = 1 - lnf.sum(axis=0)
        w = d / d.sum()

        w = pd.DataFrame(w)
        w = w.round(5)
        w.index = df.columns
        w.columns = ['weight']
        return w
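
  • A quick sanity check on synthetic data sized like the question (the column names and random values below are just placeholders for illustration):

    import numpy as np
    import pandas as pd

    # hypothetical test data: 5,000,000 rows x 6 columns of non-negative values
    rng = np.random.default_rng(0)
    data = pd.DataFrame(rng.random((5_000_000, 6)),
                        columns=[f'c{i}' for i in range(6)])

    weights = ewm(data)   # vectorised version above
    print(weights)        # one 'weight' per column, summing to ~1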