Search code examples
pythonnumpymatplotlibscipy

Plotting scipy skewnorm gives me data of an unexpected magnitude


I am trying to estimate a skew normal distribution to my sample data, and plot it alongside the data. If I plot the data as a histogram I get this plot:

enter image description here

I then try to fit a skew normal distribution to my data and plot that alongside it. However, the fitted data yields a curve that is of the right shape, but significantly lower than I expected. enter image description here

If I remove the histogram from this picture, I get a curve that looks like what I expect. It is just scaled down by a lot: enter image description here

How would I go about representing both of these renderings in one plot? I am sure there is something trivial I am missing. I don't normally code in Python.

Code:

from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

data = [ 10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16
       , 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12
       , 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10
       , 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13
       , 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11
       , 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10
       , 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12
       , 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25
       , 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12
       , 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]

X = np.linspace(min(data), max(data), num=200)

fig,ax = plt.subplots()
#ax.hist(data, bins=25)
ax.plot(X, stats.skewnorm.pdf(X, *stats.skewnorm.fit(data)))
fig.savefig("test.png")

Solution

  • As your data is discrete, having explicit bin edges that fall between the integer positions makes a much more suitable histogram. With default bin edges, some bins might be empty, or otherwise bins will get a different number of corresponding input values.

    Normalizing the histogram

    A pdf is normalized to have a total area of 1. You can add ax.hist(..., density=True) to also have the histogram normalized.

    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator
    import numpy as np
    from scipy import stats
    
    data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
    
    X = np.linspace(min(data), max(data), num=200)
    bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins, density=True)
    fit_params = stats.skewnorm.fit(data)
    ax.plot(X, stats.skewnorm.pdf(X, *fit_params))
    ax.fill_between(X, stats.skewnorm.pdf(X, *fit_params), color='red', alpha=0.3)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.show()
    

    fitting histogram to stats.skewnorm

    Scaling the pdf

    Instead of normalizing the histogram, you can also multiply the pdf with the histograms area. The area is the sum of the areas of the bars. As the bars' heights sum to the total number of observations, and using a bin width of 1, the area would be len(data).

    To better indicate the discreteness of the data, the bars could be visualized narrower. rwidth= in ax.hist() is a scaling factor.

    import matplotlib.pyplot as plt
    import numpy as np
    from scipy import stats
    
    data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
    
    bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
    X = np.linspace(bins[0], bins[-1], num=200)
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins, density=False, rwidth=0.3)
    fit_params = stats.skewnorm.fit(data)
    ax.plot(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson')
    ax.fill_between(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson', alpha=0.3)
    ax.set_xticks(range(min(data), max(data) + 1))
    ax.margins(x=0)
    plt.show()
    

    scaling the pdf with the histogram

    Fitting a negative binomial

    A negative binomial is a discrete distribution. It would look like:

    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.stats import nbinom
    
    data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
    
    loc = min(data) # suppose the distribution starts at the lowest observed value
    mean = np.mean(data)
    var = np.var(data)
    p = (mean - loc) / var
    n = (mean - loc) ** 2 / (var - (mean - loc))
    
    fig, ax = plt.subplots()
    
    bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
    ax.hist(data, bins=bins, rwidth=0.9, density=True)
    
    X = np.arange(min(data) - 1, max(data) + 2)
    ax.plot(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', marker='o', ls=':')
    ax.fill_between(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', alpha=0.3)
    ax.set_xticks(np.arange(min(data), max(data) + 1))
    ax.margins(x=0)
    plt.show()
    

    fitting a negative binomial