Tags: python, numpy, matplotlib, scipy, data-fitting

Gamma function fitting OptimizeWarning: Covariance of the parameters could not be estimated in Python


I am trying to fit functions to histograms using curve_fit from scipy; however, a warning occurs when running the program:

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.special import gamma as gamma_func
import tkinter as tk


def gaussian(x, m, s):
    return (1 / (s * np.sqrt(2 * np.pi))) * (np.exp((-1 / 2) * ((x - m) / s) ** 2))


def gamma(x, k, theta):
    return (x ** (k - 1) * np.exp(-x / theta)) / (theta ** k * gamma_func(k))


def display_gaussian(data_set, mean, sigma):
    plt.figure()
    y, bin_edges = np.histogram(data_set, bins=100, density=True)
    x = (bin_edges[:-1] + bin_edges[1:]) / 2
    plt.hist(data_set, density=True, color='skyblue', label='Histogram', edgecolor='black')

    init_gaussian = [mean, sigma]
    params_gaussian, cov_gaussian = curve_fit(gaussian, x, y, init_gaussian)


    fit = np.linspace(min(data_set), max(data_set), 1000)

    plt.plot(fit, gaussian(fit, *params_gaussian), 'r--',
             label=f'Fit: mean={params_gaussian[0]:.2f}, sigma={params_gaussian[1]:.2f}')
    plt.xlabel('Distribution')
    plt.ylabel('Frequency')
    plt.title('Gaussian distribution')
    plt.legend()
    plt.show(block=False)


def display_gamma(data_set, sigma):
    plt.figure()
    y, bin_edges = np.histogram(data_set, bins=100, density=True)
    x = (bin_edges[:-1] + bin_edges[1:]) / 2
    plt.hist(data_set, density=True, color='forestgreen', label='Histogram', edgecolor='black')

    shape_guess = 2.0
    init_gamma = [shape_guess, sigma]
    params_gamma, cov_gamma = curve_fit(gamma, x, y, init_gamma)


    fit = np.linspace(min(data_set), max(data_set), 1000)

    plt.plot(fit, gamma(fit, *params_gamma), 'y--',
             label=f'Fit: mean={params_gamma[0]:.2f}, sigma={params_gamma[1]:.2f}')
    plt.xlabel('Distribution')
    plt.ylabel('Frequency')
    plt.title('Gamma distribution')
    plt.legend()
    plt.show(block=False)


def run():
    mean = float(m_ent.get())
    sigma = float(s_ent.get())

    data_set = np.random.normal(mean, sigma, 1000)

    display_gaussian(data_set, mean, sigma)
    display_gamma(data_set, sigma)


root = tk.Tk()
root.title("Input parameters")
root.minsize(400, 300)

m_l = tk.Label(root, text="mean", font=15)
m_l.pack(pady=10)
m_ent = tk.Entry(root)
m_ent.pack(pady=10)

s_l = tk.Label(root, text="sigma", font=15)
s_l.pack(pady=10)
s_ent = tk.Entry(root)
s_ent.pack(pady=10)

run_btn = tk.Button(root, text="Run", command=run, font=15)
run_btn.pack(pady=20)

root.mainloop()

OptimizeWarning: Covariance of the parameters could not be estimated

The warning is raised at this line:

params_gamma, cov_gamma = curve_fit(gamma, x, y, init_gamma)

I am not sure what the problem is here.

I thought the problem was with the bins of the histogram and tried setting them to 'auto' and changing the density, but the warning still occurs. The same problem happened when fitting the Gaussian; after adding density=True the covariance was estimated successfully, but that does not work for the Gamma fit.


Solution

  • You are facing three challenges:

    • Fitting normally distributed data (which can be negative) with a gamma distribution, which only has positive support (see the quick check below);
    • Fitting a PDF to binned data (a histogram) when you have access to the raw data and could perform an MLE instead;
    • Writing the PDF naively when you can rely on the numerically stable version implemented in scipy.stats.
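
    A quick way to see the first point, using the hand-written gamma PDF from the question: with a non-integer shape parameter, x ** (k - 1) is nan for negative x in real arithmetic, so bin centres left of zero produce nan values and leave curve_fit without finite residuals to work with.

    import numpy as np
    from scipy.special import gamma as gamma_func

    def gamma(x, k, theta):
        return (x ** (k - 1) * np.exp(-x / theta)) / (theta ** k * gamma_func(k))

    # Bin centres from a normal sample can be negative; those entries evaluate to nan
    print(gamma(np.array([-1.0, 0.5, 2.0]), 1.5, 0.8))
    # the entry for x = -1.0 is nan (numpy warns about an invalid value in power)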

    Here is an example of how to draw a gamma sample and fit it with MLE:

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import stats, optimize
    
    # Draw a sample from a known gamma law
    law = stats.gamma(a=1.5, scale=0.8)
    data = law.rvs(1_000)
    
    # Negative log-likelihood of the raw (unbinned) sample
    def likelihood_factory(data):
        def wrapped(parameters):
            return - np.sum(stats.gamma(a=parameters[0], scale=parameters[1]).logpdf(data))
        return wrapped
    
    likelihood = likelihood_factory(data)
    
    # MLE: minimize the negative log-likelihood
    solution = optimize.minimize(likelihood, x0=[1., 1.], tol=1e-4)
    #  message: Optimization terminated successfully.
    #  success: True
    #   status: 0
    #      fun: 1126.2309539053672
    #        x: [ 1.508e+00  7.876e-01]
    #      nit: 8
    #      jac: [-1.526e-05 -3.052e-05]
    # hess_inv: [[ 3.781e-03 -1.978e-03]
    #            [-1.978e-03  1.447e-03]]
    #     nfev: 33
    #     njev: 11
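
    The fitted shape and scale are in solution.x. Here is a sketch of how the result could be checked against the data, continuing the snippet above (the bin count of 100 is an arbitrary choice, and scipy.stats can also run the same MLE directly through gamma.fit):

    # Equivalent MLE straight from scipy.stats; floc=0 pins the location parameter at zero
    k_hat, loc_hat, theta_hat = stats.gamma.fit(data, floc=0)

    # Overlay the fitted PDF on a density histogram of the raw sample
    xs = np.linspace(data.min(), data.max(), 1000)
    plt.hist(data, bins=100, density=True, color='skyblue', edgecolor='black', label='sample')
    plt.plot(xs, stats.gamma(a=solution.x[0], scale=solution.x[1]).pdf(xs), 'r--',
             label=f'MLE fit: k={solution.x[0]:.2f}, theta={solution.x[1]:.2f}')
    plt.legend()
    plt.show()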
    

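    If you do want to keep the curve_fit-on-histogram approach from the question, the last point above still applies: below is a sketch under that assumption, again continuing from the gamma sample above, using the numerically stable scipy.stats PDF and positive parameter bounds (the p0 and bounds values are illustrative guesses):

    from scipy.optimize import curve_fit

    def gamma_pdf(x, k, theta):
        # numerically stable gamma PDF instead of the hand-written formula
        return stats.gamma(a=k, scale=theta).pdf(x)

    # Bin the sample and fit the PDF to the bin centres, as in the question
    y, bin_edges = np.histogram(data, bins=100, density=True)
    centres = (bin_edges[:-1] + bin_edges[1:]) / 2

    params, cov = curve_fit(gamma_pdf, centres, y, p0=[1.0, 1.0],
                            bounds=([1e-6, 1e-6], [np.inf, np.inf]))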