I am trying to download from pandas datareader library the stock price and calculate the (daily, weekly,monthly,etc...) return based on the ticker that I provide.
After downloading the data, I execute a kstest at the distribution of this data and evaluate if it is similar to a bi-normal distribution (sum of two normal distribution) based on the p-value provided.
Since I am executing only one kstest for this distribution I want to maximize the p-value(minimize -p-value) utilizing the "minimize" library in Python varying the values of mean, standard deviation and weight of this two distribution.
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.optimize import minimize
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
from pandas_datareader import data
import time
import xlwt
import matplotlib.ticker as mtick
from sklearn import datasets
def Puxa_Preco(ticker,start_date,end_date,lag):
dados= data.get_data_yahoo(ticker, start_date, end_date )
from sklearn import datasets
data_set = np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))
data_set = data_set.fillna(method='ffill')
data_set = data_set.dropna()
y = pd.DataFrame()
y=data_set
x = np.arange(len(y))
size = len(y)
print(y)
return y
def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,weight2, mean2, stdv2):
"""
CDF of a mixture of two normal distributions.
"""
return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
weight2*st.norm.cdf(distribuicao, mean2, stdv2))
def Objetivo(X,distribuicao):
peso_dist_1 = X[0]
mi1 = X[1]
sigma1 = X[2]
peso_dist_2 = 1-X[0]
mi2 = X[3]
sigma2 = X[4]
stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf,
args=(peso_dist_1, mi1, sigma1,peso_dist_2, mi2, sigma2))
''' Kolmogorov-Smirnov Test, to test whether or not the data is from a given distribution. The
returned p-value indicates the probability that the data is from the given distribution,
i.e. a low p-value means the data are likely not from the tested distribution.
Note that, for this test, it is necessary to specify shape, location, and scale parameters,
to obtain meaningful results (c,loc,scale).
stat2: the test statistic, e.g. the max distance between the
cumulated distributions '''
return -pvalue
ticker = 'PETR4.SA'
start_date = '2010-01-02' #yyyy-mm-dd
end_date = '2015-01-02'
for lag in range(1,503):
distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
n = len(distribuicao)
ChuteInicial=[0.3,0.0010,0.0010,-0.0030,0.0830] #peso_dist_1, mi1, sigma1, mi2, sigma2
test = [0.2,0.0020,0.0110,0.8,-0.0020,0.0230]
Limites = ([0,1],[-50,+50],[0,+50],[0,1],[-50,+50],[0,+50]) #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
print("------------------------------------------------------------------------------------------------")
print("Validation Test:")
print(-Objetivo(test,distribuicao)) #the value should be around -0.90 to verify that the objective function it is ok
solution = minimize(fun=Objetivo,x0=ChuteInicial,args=distribuicao,method='SLSQP',bounds = Limites) #minimize - p-valor
print("------------------------------------------------------------------------------------------------")
print("solution:")
print(solution)
Finding the following solution:
fun: -8.098252265651002e-53
jac: array([-2.13080032e-35, 0.00000000e+00, 0.00000000e+00, -1.93307671e-34, 7.91878934e-35])
message: 'Optimization terminated successfully.'
nfev: 8
nit: 1
njev: 1
status: 6
success: True
x: array([ 0.3 , 0.001, 0.001, -0.003, 0.083])
But I know that the correct answer should be something like (test) : [0.2,0.0020,0.0110,0.8,-0.0020,0.0230] producing a p-value of 0.90
Seems to me that it is running only a few simulations and since it is not changing the p-value it stops.
Is there a way that I can ensure that the "minimize" will stop only after finding a p-value greater than 0.9 ? Can someone please help me?
I tried using the minimize considering Nelder Mead method, and seems more accurate but not even close to the 0.9 p-value that should be the answer and I don't know if the Nelder Mead method considers the limits that I provided.
#solution = minimize(fun=Objetivo,x0=(ChuteInicial),args=distribuicao,method='Nelder-Mead',bounds = Limites,options={'int':1000000})
By able to minimize the k-s statistic instead of p-value and other modifications in defining the cdf function, I think I was able to estimate the parameters. Here is my code and the optimized parameter estimates. I got the idea to minimize the k-s statistic from this paper (https://www.researchgate.net/publication/250298633_Minimum_Kolmogorov-Smirnov_test_statistic_parameter_estimates)
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.optimize import minimize
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
from pandas_datareader import data
import time
import xlwt
import matplotlib.ticker as mtick
from sklearn import datasets
def Puxa_Preco(ticker,start_date,end_date,lag):
dados= data.get_data_yahoo(ticker, start_date, end_date )
data_set = np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))
data_set = data_set.fillna(method='ffill')
data_set = data_set.dropna()
y = pd.DataFrame()
y=data_set
x = np.arange(len(y))
size = len(y)
return y
def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,mean2, stdv2):
## CDF of a mixture of two normal distributions.
return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
(1-weight1)*st.norm.cdf(distribuicao, mean2, stdv2))
def Objetivo(X, distribuicao):
stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf, args=X)
return stat2
ticker = 'PETR4.SA'
start_date = '2010-01-02' #yyyy-mm-dd
end_date = '2015-01-02'
for lag in range(1,10):
distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
ChuteInicial=[0.2,0.0020,0.0110,-0.0020,0.0230] #peso_dist_1, mi1, sigma1, mi2, sigma2
Limites = ([0,1],[-0.1,+0.1],[0,None],[-0.1,0.1],[0,None]) #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
print("------------------------------------------------------------------------------------------------")
print("Intial Guess")
print(" weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*ChuteInicial))
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=ChuteInicial)
print("k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("------------------------------------------------------------------------------------------------")
solution = minimize(fun=Objetivo,x0=ChuteInicial,args=(distribuicao), method='SLSQP',bounds = Limites, options={'ftol':1e-12})
print("Optimized Solution:")
print("------------------------------------------------------------------------------------------------")
print(solution.message)
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=solution.x)
print("Optimized k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("Optimized weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*solution.x))
print("------------------------------------------------------------------------------------------------")
which results in
LAG: 1
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.016419395777755863, p-value: 0.9027260234690881
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.014896801186217778, p-value: 0.952748910069967
Optimized weight1: 0.2016, mean1: 0.0016, stdv1: 0.0108, mean2: -0.0017, stdv2: 0.0227
------------------------------------------------------------------------------------------------
LAG: 2
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.0791846286223467, p-value: 5.496852766236095e-07
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.013690695822088705, p-value: 0.978164746056248
Optimized weight1: 0.2291, mean1: -0.0047, stdv1: 0.0125, mean2: -0.0012, stdv2: 0.0351
------------------------------------------------------------------------------------------------
LAG: 3
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.13436209329730314, p-value: 2.533666062868497e-19
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.01622740259609401, p-value: 0.9106251238446841
Optimized weight1: 0.2638, mean1: -0.0058, stdv1: 0.0199, mean2: -0.0020, stdv2: 0.0433
------------------------------------------------------------------------------------------------