Search code examples
pythonscipyminimizep-valuekolmogorov-smirnov

How can I minimize -p-value in python provided by the kstest of one distribution based on data price return?


I am trying to download from pandas datareader library the stock price and calculate the (daily, weekly,monthly,etc...) return based on the ticker that I provide.

After downloading the data, I execute a kstest at the distribution of this data and evaluate if it is similar to a bi-normal distribution (sum of two normal distribution) based on the p-value provided.

Since I am executing only one kstest for this distribution I want to maximize the p-value(minimize -p-value) utilizing the "minimize" library in Python varying the values of mean, standard deviation and weight of this two distribution.

import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.optimize import minimize
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
from pandas_datareader import data
import time
import xlwt
import matplotlib.ticker as mtick
from sklearn import datasets

def Puxa_Preco(ticker,start_date,end_date,lag):    
    dados= data.get_data_yahoo(ticker, start_date, end_date )

    from sklearn import datasets
    data_set =  np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))

    data_set = data_set.fillna(method='ffill') 
    data_set = data_set.dropna() 

    y = pd.DataFrame()
    y=data_set

    x = np.arange(len(y))
    size = len(y)
    print(y)
    return y

def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,weight2, mean2, stdv2):
    """
    CDF of a mixture of two normal distributions.
    """
    return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
            weight2*st.norm.cdf(distribuicao, mean2, stdv2))

def Objetivo(X,distribuicao):
    peso_dist_1 = X[0]
    mi1 = X[1]
    sigma1 = X[2]
    peso_dist_2 = 1-X[0]
    mi2 = X[3]
    sigma2 = X[4]

    stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf,
                                args=(peso_dist_1, mi1, sigma1,peso_dist_2, mi2, sigma2))
    ''' Kolmogorov-Smirnov Test, to test whether or not the data is from a given distribution. The 
        returned p-value indicates the probability that the data is from the given distribution, 
        i.e. a low p-value means the data are likely not from the tested distribution.
        Note that, for this test, it is necessary to specify shape, location, and scale parameters,
        to obtain meaningful results (c,loc,scale). 

        stat2:     the test statistic, e.g. the max distance between the
        cumulated distributions '''

    return -pvalue

ticker = 'PETR4.SA'
start_date  = '2010-01-02'      #yyyy-mm-dd
end_date    = '2015-01-02'

for lag in range(1,503):

    distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
    n = len(distribuicao)

    ChuteInicial=[0.3,0.0010,0.0010,-0.0030,0.0830]                                      #peso_dist_1, mi1, sigma1, mi2, sigma2
    test = [0.2,0.0020,0.0110,0.8,-0.0020,0.0230]
    Limites = ([0,1],[-50,+50],[0,+50],[0,1],[-50,+50],[0,+50])                              #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
    print("------------------------------------------------------------------------------------------------")
    print("Validation Test:")

    print(-Objetivo(test,distribuicao))                                             #the value should be around -0.90 to verify that the objective function it is ok

    solution = minimize(fun=Objetivo,x0=ChuteInicial,args=distribuicao,method='SLSQP',bounds = Limites)             #minimize - p-valor
    print("------------------------------------------------------------------------------------------------")
    print("solution:")
    print(solution)

Finding the following solution:

         fun: -8.098252265651002e-53
         jac: array([-2.13080032e-35,  0.00000000e+00,  0.00000000e+00, -1.93307671e-34, 7.91878934e-35])
     message: 'Optimization terminated successfully.'
        nfev: 8
         nit: 1
        njev: 1
      status: 6
     success: True
           x: array([ 0.3  ,  0.001,  0.001, -0.003,  0.083])

But I know that the correct answer should be something like (test) : [0.2,0.0020,0.0110,0.8,-0.0020,0.0230] producing a p-value of 0.90

Seems to me that it is running only a few simulations and since it is not changing the p-value it stops.

Is there a way that I can ensure that the "minimize" will stop only after finding a p-value greater than 0.9 ? Can someone please help me?

I tried using the minimize considering Nelder Mead method, and seems more accurate but not even close to the 0.9 p-value that should be the answer and I don't know if the Nelder Mead method considers the limits that I provided.

#solution = minimize(fun=Objetivo,x0=(ChuteInicial),args=distribuicao,method='Nelder-Mead',bounds = Limites,options={'int':1000000})            

Solution

  • By able to minimize the k-s statistic instead of p-value and other modifications in defining the cdf function, I think I was able to estimate the parameters. Here is my code and the optimized parameter estimates. I got the idea to minimize the k-s statistic from this paper (https://www.researchgate.net/publication/250298633_Minimum_Kolmogorov-Smirnov_test_statistic_parameter_estimates)

    import warnings
    import numpy as np
    import pandas as pd
    import scipy.stats as st
    from scipy.optimize import minimize
    import statsmodels as sm
    import matplotlib
    import matplotlib.pyplot as plt
    from pandas_datareader import data
    import time
    import xlwt
    import matplotlib.ticker as mtick
    from sklearn import datasets
    
    def Puxa_Preco(ticker,start_date,end_date,lag):    
        dados= data.get_data_yahoo(ticker, start_date, end_date )
    
        data_set =  np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))
    
        data_set = data_set.fillna(method='ffill') 
        data_set = data_set.dropna() 
    
        y = pd.DataFrame()
        y=data_set
    
        x = np.arange(len(y))
        size = len(y)
    
        return y
    
    def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,mean2, stdv2):
        ## CDF of a mixture of two normal distributions.
    
        return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
                (1-weight1)*st.norm.cdf(distribuicao, mean2, stdv2))
    
    def Objetivo(X, distribuicao):
        stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf, args=X)
        return stat2
    
    ticker = 'PETR4.SA'
    start_date  = '2010-01-02'      #yyyy-mm-dd
    end_date    = '2015-01-02'
    
    for lag in range(1,10):
        distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
    
        ChuteInicial=[0.2,0.0020,0.0110,-0.0020,0.0230]                                      #peso_dist_1, mi1, sigma1, mi2, sigma2
    
        Limites = ([0,1],[-0.1,+0.1],[0,None],[-0.1,0.1],[0,None])                              #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
    
        print("------------------------------------------------------------------------------------------------")
        print("Intial Guess")
        print(" weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*ChuteInicial))
        ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=ChuteInicial)
        print("k-s stat: {}, p-value: {}".format(ks_stat, p_value))
        print("------------------------------------------------------------------------------------------------")
    
        solution = minimize(fun=Objetivo,x0=ChuteInicial,args=(distribuicao), method='SLSQP',bounds = Limites, options={'ftol':1e-12})
    
        print("Optimized Solution:")
        print("------------------------------------------------------------------------------------------------")
        print(solution.message)
        ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=solution.x)
        print("Optimized k-s stat: {}, p-value: {}".format(ks_stat, p_value))
        print("Optimized weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*solution.x))
        print("------------------------------------------------------------------------------------------------")
    

    which results in

     LAG: 1
    ------------------------------------------------------------------------------------------------
    Intial Guess
     weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
    k-s stat: 0.016419395777755863, p-value: 0.9027260234690881
    ------------------------------------------------------------------------------------------------
    Optimized Solution:
    ------------------------------------------------------------------------------------------------
    Optimization terminated successfully.
    Optimized k-s stat: 0.014896801186217778, p-value: 0.952748910069967
    Optimized weight1: 0.2016, mean1: 0.0016, stdv1: 0.0108, mean2: -0.0017, stdv2: 0.0227
    ------------------------------------------------------------------------------------------------
    LAG: 2
    ------------------------------------------------------------------------------------------------
    Intial Guess
     weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
    k-s stat: 0.0791846286223467, p-value: 5.496852766236095e-07
    ------------------------------------------------------------------------------------------------
    Optimized Solution:
    ------------------------------------------------------------------------------------------------
    Optimization terminated successfully.
    Optimized k-s stat: 0.013690695822088705, p-value: 0.978164746056248
    Optimized weight1: 0.2291, mean1: -0.0047, stdv1: 0.0125, mean2: -0.0012, stdv2: 0.0351
    ------------------------------------------------------------------------------------------------
    LAG: 3
    ------------------------------------------------------------------------------------------------
    Intial Guess
     weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
    k-s stat: 0.13436209329730314, p-value: 2.533666062868497e-19
    ------------------------------------------------------------------------------------------------
    Optimized Solution:
    ------------------------------------------------------------------------------------------------
    Optimization terminated successfully.
    Optimized k-s stat: 0.01622740259609401, p-value: 0.9106251238446841
    Optimized weight1: 0.2638, mean1: -0.0058, stdv1: 0.0199, mean2: -0.0020, stdv2: 0.0433
    ------------------------------------------------------------------------------------------------