The code below works, but it is very slow on large data — a DataFrame of 5,000,000 rows and 6 columns may take more than a day to process.
How could I optimise it? Many thanks!
def ewm(df):
    """Entropy weight method: compute one weight per column of *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric data; each column is one criterion. Assumes every column
        has max != min (a constant column makes the normalisation divide
        by zero) and at least 2 rows (log(1) == 0 would divide by zero).

    Returns
    -------
    pd.DataFrame
        Indexed by df's columns, single column 'weight', rounded to 5
        decimals; the weights sum to 1 (before rounding).
    """
    # Min-max normalise each column to [0, 1].
    df = df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    rows, cols = df.shape
    # Entropy scaling constant. np.log instead of math.log: `math` is
    # never imported in this file.
    k = 1.0 / np.log(rows)
    # Hoist the column sums out of the loop. The original recomputed
    # df.iloc[:, j].sum() for EVERY cell -- O(rows) work per cell,
    # O(rows^2 * cols) overall, which is why large frames took >1 day.
    col_sums = df.sum(axis=0)
    # 0.0 default encodes the convention 0 * log(0) == 0.
    lnf = [[0.0] * cols for _ in range(rows)]
    for i in range(rows):
        for j in range(cols):
            # .iat is fast scalar access; df.iloc[i][j] is chained
            # indexing (builds a whole row Series per cell).
            v = df.iat[i, j]
            if v != 0:
                p = v / col_sums.iloc[j]
                lnf[i][j] = -k * p * np.log(p)
    lnf = pd.DataFrame(lnf)
    # Degree of divergence per column; normalise to get the weights.
    d = 1 - lnf.sum(axis=0)
    w = pd.DataFrame(d / d.sum())
    w = w.round(5)
    w.index = df.columns
    w.columns = ['weight']
    return w
Letting NumPy and pandas do the looping in vectorised form should speed this up a lot:
import numpy as np
import pandas as pd
def ewm(df):
    """Entropy weight method, fully vectorised.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric data; each column is one criterion. Assumes every column
        has max != min (a constant column makes the normalisation divide
        by zero) and at least 2 rows.

    Returns
    -------
    pd.DataFrame
        Indexed by df's columns, single column 'weight', rounded to 5
        decimals; the weights sum to 1 (before rounding).
    """
    # Min-max normalise each column to [0, 1] (vectorised; .apply with a
    # lambda is not needed for this).
    norm = (df - df.min()) / (df.max() - df.min())
    rows = norm.shape[0]
    # np.log instead of math.log: `math` is never imported here, so the
    # original raised NameError.
    k = 1.0 / np.log(rows)
    # Per-cell share of its column total.
    p = (norm / norm.sum(axis=0)).to_numpy(dtype=float)
    # BUG FIX: np.log(p, where=...) WITHOUT an `out=` argument leaves the
    # masked cells uninitialised (arbitrary garbage / NaN), which then
    # poisons the entropy sums. Supplying a zero-filled `out` implements
    # the 0 * log(0) == 0 convention correctly.
    logp = np.zeros_like(p)
    np.log(p, out=logp, where=p > 0)
    lnf = -k * p * logp
    # Degree of divergence per column; normalise to get the weights.
    d = 1 - lnf.sum(axis=0)
    w = pd.DataFrame(d / d.sum(), index=df.columns, columns=['weight'])
    return w.round(5)