Given a sample from a distribution that is assumed to be Gaussian (a normal distribution with unknown mu and sigma), the task is to find the mean and standard deviation that describe it best.
What is the mathematical difference between the two approaches below (fitting a Gaussian to the histogram with curve_fit versus minimizing the negative log-likelihood), and why do they yield different results? And if they are different, why and when should each method be used? I think both are parametric and can be used in the same cases.
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
# define the true mean and std of the Gaussian (normal) distribution
mean = 5.0
std = 2.0
# generate random variates (samples) and get histogram
samples = np.random.normal(loc=mean, scale=std, size=100)
hist, bin_edges = np.histogram(samples, density=True)
midpoints = (bin_edges[:-1] + bin_edges[1:])/2.
# fit the Gaussian to the histogram to find mean and std
def func(x, mean, std):
    return norm.pdf(x, loc=mean, scale=std)
from scipy.optimize import curve_fit
popt, pcov = curve_fit(func, midpoints, hist)
fit_mean, fit_std = popt
print("fitted mean,std:",fit_mean,fit_std)
print("sample mean,std:",np.mean(samples),np.std(samples))
# negative log likelihood approach
def normaldistribution_negloglikelihood(params):
mu, sigma = params
return -np.log(np.sum(norm.pdf(samples, loc=mu, scale=sigma)))
#return -np.sum(norm.pdf(samples, loc=mu, scale=sigma))
from scipy.optimize import minimize
result = minimize(normaldistribution_negloglikelihood, x0=[0, 1], bounds=((None, None), (1e-5, None)))  # optionally: method='Nelder-Mead'
if result.success:
    fitted_params = result.x
    #print("fitted_params", fitted_params)
else:
    raise ValueError(result.message)
nll_mean, nll_std = fitted_params
print("neg LL mean,std:",nll_mean, nll_std)
# plot
plt.plot(midpoints, hist, label="sample histogram")
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=fit_mean, scale=fit_std), label="fitted PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
plt.legend()
plt.show()
Your likelihood is wrong: you should sum the logs of the PDF values, not take the log of their sum. The likelihood of i.i.d. samples is the product of the individual densities, so the negative log-likelihood is the negative sum of the log-densities:
def normaldistribution_negloglikelihood(params):
    mu, sigma = params
    return -np.sum(norm.logpdf(samples, loc=mu, scale=sigma))

result = minimize(normaldistribution_negloglikelihood, x0=[0, 1],
                  bounds=((None, None), (1e-5, None)))  # optionally: method='Nelder-Mead'
nll_mean, nll_std = result.x
x = np.linspace(-5,15,500)
plt.plot(x, norm.pdf(x, loc=mean, scale=std), label="true PDF")
plt.plot(x, norm.pdf(x, loc=nll_mean, scale=nll_std), label="neg LL estimator")
You should maximize the likelihood (equivalently, minimize the negative log-likelihood). The histogram method also depends on your choice of bin size and does not use all of the information in your data: the least-squares fit only ever sees the binned counts, not the individual samples.
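As a sanity check (a minimal sketch, assuming the samples array and the func histogram model from the question are in scope): for a Gaussian, the maximum-likelihood estimates have a closed form, namely the sample mean and the biased (ddof=0) standard deviation, and scipy.stats.norm.fit computes exactly this MLE. The histogram fit, by contrast, shifts as the (here arbitrarily chosen) bin counts change:

import numpy as np
from scipy.stats import norm
from scipy.optimize import curve_fit

# closed-form MLE for a Gaussian: sample mean and biased (ddof=0) std
print("closed-form MLE:", np.mean(samples), np.std(samples))

# scipy's built-in MLE fit returns the same values
loc, scale = norm.fit(samples)
print("norm.fit MLE:   ", loc, scale)

# the histogram fit changes with the bin count
for bins in (5, 10, 20, 50):
    hist, edges = np.histogram(samples, bins=bins, density=True)
    mids = (edges[:-1] + edges[1:]) / 2.0
    popt, _ = curve_fit(func, mids, hist, p0=[np.mean(samples), np.std(samples)])
    print(f"curve_fit, {bins:2d} bins:", popt)

With numpy's default of 10 bins you are estimating two parameters from only ten noisy points, which is why the histogram fit is noticeably less stable than the MLE.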