Tags: python, statistics, mathematical-optimization, curve-fitting, log-likelihood

Parametric PDF estimation: histogram vs. likelihood


Given a sample from a distribution that is assumed to be Gaussian (a normal distribution with unknown mu and sigma), the task is to find the mean and standard deviation that describe it best.

What is the mathematical difference between the two approaches below, and why do they yield different results? And if they differ, why and when should each method be used? I think both are parametric and can be used in the same cases.

  1. do a least-squares curve fit of the normal PDF to the histogram, with parameters mu, sigma
  2. maximize the likelihood = sum of PDF(mu, sigma) over the samples (both objectives are written out below)
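
For reference, the two objectives can be written out explicitly. The notation below (samples $x_i$, bin midpoints $m_k$, normalized bin heights $h_k$, normal PDF $f$) is introduced here for illustration:

$$\text{(1) least squares on the histogram:}\quad \min_{\mu,\sigma}\ \sum_k \bigl(h_k - f(m_k;\mu,\sigma)\bigr)^2$$

$$\text{(2) maximum likelihood:}\quad \max_{\mu,\sigma}\ \log \prod_{i=1}^{n} f(x_i;\mu,\sigma) = \sum_{i=1}^{n} \log f(x_i;\mu,\sigma)$$

Note that the likelihood is a product over the samples, so its log is a sum of logs.
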
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm

# define true mean and std for Gaussian normal distribution

mean = 5.0
std = 2.0

# generate random variates (samples) and get a normalized histogram

samples = np.random.normal(loc=mean, scale=std, size=100)
hist, bin_edges = np.histogram(samples, density=True)
midpoints = (bin_edges[:-1] + bin_edges[1:])/2.

# fit the Gaussian to find mean and std

def func(x, mean, std):
    return norm.pdf(x, loc=mean, scale=std)

from scipy.optimize import curve_fit

popt, pcov = curve_fit(func, midpoints, hist)
fit_mean, fit_std = popt

print("fitted mean,std:",fit_mean,fit_std)
print("sample mean,std:",np.mean(samples),np.std(samples))

# negative log likelihood approach

def normaldistribution_negloglikelihood(params):
    mu, sigma = params
    # note: this takes the log of a sum of PDF values (see the solution below)
    return -np.log(np.sum(norm.pdf(samples, loc=mu, scale=sigma)))
    #return -np.sum(norm.pdf(samples, loc=mu, scale=sigma))
    
from scipy.optimize import minimize

result = minimize(normaldistribution_negloglikelihood, x0=[0, 1],
                  bounds=((None, None), (1e-5, None)))  # optionally: method='Nelder-Mead'

if result.success:
    fitted_params = result.x
    #print("fitted_params", fitted_params)
else:
    raise ValueError(result.message)
    

nll_mean, nll_std = fitted_params
print("neg LL mean,std:",nll_mean, nll_std)
    

# plot

plt.plot(midpoints, hist, label="sample histogram")

x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=fit_mean, scale=fit_std), label="fitted PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")


plt.legend()
plt.show()

(plot: sample histogram with the true, curve-fitted, and neg-LL PDFs overlaid)


Solution

  • Your likelihood is wrong: you should sum the logs of the PDF values, not take the log of their sum, so:

    def normaldistribution_negloglikelihood(params):
        mu, sigma = params
        return -np.sum(norm.logpdf(samples, loc=mu, scale=sigma))
        
    result = minimize(normaldistribution_negloglikelihood, x0=[0,1] ,
                      bounds=((None,None), (1e-5,None)) )#, method='Nelder-Mead')
    
    nll_mean, nll_std = result.x
    x = np.linspace(-5,15,500)
    
    plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
    plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
    

    (plot: true PDF vs. the corrected neg-LL fit)
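
    One practical reason to sum log-PDF values instead of multiplying PDF values: the raw likelihood underflows for larger samples. A minimal sketch (the seed and the sample size of 1000 are my choices for illustration, not from the question):

    import numpy as np
    from scipy.stats import norm

    rng = np.random.default_rng(0)
    samples = rng.normal(loc=5.0, scale=2.0, size=1000)

    # raw likelihood: a product of 1000 values, all below ~0.2, underflows to 0.0
    print(np.prod(norm.pdf(samples, loc=5.0, scale=2.0)))    # 0.0

    # log-likelihood: a sum of logs stays in a safe floating-point range
    print(np.sum(norm.logpdf(samples, loc=5.0, scale=2.0)))  # roughly -2100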

    You should maximize the likelihood (equivalently, minimize the negative log-likelihood). The histogram method depends on the number and width of the bins and does not use all of your data, since binning discards the positions of the samples within each bin.
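
    For a Gaussian, the maximum-likelihood estimates also have a closed form (the sample mean, and the standard deviation with ddof=0), and scipy.stats.norm.fit performs the maximum-likelihood fit directly. A quick sketch of the check (the sample is regenerated here purely for illustration):

    import numpy as np
    from scipy.stats import norm

    rng = np.random.default_rng(0)
    samples = rng.normal(loc=5.0, scale=2.0, size=100)

    # scipy's built-in maximum-likelihood fit for the normal distribution
    mle_mean, mle_std = norm.fit(samples)

    # closed-form Gaussian MLE: sample mean, and std with ddof=0
    print(mle_mean, np.mean(samples))  # identical
    print(mle_std, np.std(samples))    # identical (np.std defaults to ddof=0)

    minimize should reproduce these values up to optimizer tolerance, while the curve_fit result will shift as you change the bins argument of np.histogram.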