Plotting scipy skewnorm gives me data of an unexpected magnitude

I am trying to estimate a skew normal distribution to my sample data, and plot it alongside the data. If I plot the data as a histogram I get this plot:

I then try to fit a skew normal distribution to my data and plot that alongside it. However, the fitted data yields a curve that is of the right shape, but significantly lower than I expected.

If I remove the histogram from this picture, I get a curve that looks like what I expect. It is just scaled down by a lot:

How would I go about representing both of these renderings in one plot? I am sure there is something trivial I am missing. I don't normally code in Python.

Code:

from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

data = [ 10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16
       , 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12
       , 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10
       , 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13
       , 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11
       , 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10
       , 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12
       , 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25
       , 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12
       , 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]

X = np.linspace(min(data), max(data), num=200)

fig,ax = plt.subplots()
#ax.hist(data, bins=25)
ax.plot(X, stats.skewnorm.pdf(X, *stats.skewnorm.fit(data)))
fig.savefig("test.png")

Solution

As your data is discrete, having explicit bin edges that fall between the integer positions makes a much more suitable histogram. With default bin edges, some bins might be empty, or otherwise bins will get a different number of corresponding input values.

Normalizing the histogram

A pdf is normalized to have a total area of 1. You can add ax.hist(..., density=True) to also have the histogram normalized.

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
from scipy import stats

data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]

X = np.linspace(min(data), max(data), num=200)
bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
fig, ax = plt.subplots()
ax.hist(data, bins=bins, density=True)
fit_params = stats.skewnorm.fit(data)
ax.plot(X, stats.skewnorm.pdf(X, *fit_params))
ax.fill_between(X, stats.skewnorm.pdf(X, *fit_params), color='red', alpha=0.3)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

Scaling the pdf

Instead of normalizing the histogram, you can also multiply the pdf with the histograms area. The area is the sum of the areas of the bars. As the bars' heights sum to the total number of observations, and using a bin width of 1, the area would be len(data).

To better indicate the discreteness of the data, the bars could be visualized narrower. rwidth= in ax.hist() is a scaling factor.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]

bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
X = np.linspace(bins[0], bins[-1], num=200)
fig, ax = plt.subplots()
ax.hist(data, bins=bins, density=False, rwidth=0.3)
fit_params = stats.skewnorm.fit(data)
ax.plot(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson')
ax.fill_between(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson', alpha=0.3)
ax.set_xticks(range(min(data), max(data) + 1))
ax.margins(x=0)
plt.show()

Fitting a negative binomial

A negative binomial is a discrete distribution. It would look like:

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import nbinom

data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]

loc = min(data) # suppose the distribution starts at the lowest observed value
mean = np.mean(data)
var = np.var(data)
p = (mean - loc) / var
n = (mean - loc) ** 2 / (var - (mean - loc))

fig, ax = plt.subplots()

bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
ax.hist(data, bins=bins, rwidth=0.9, density=True)

X = np.arange(min(data) - 1, max(data) + 2)
ax.plot(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', marker='o', ls=':')
ax.fill_between(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', alpha=0.3)
ax.set_xticks(np.arange(min(data), max(data) + 1))
ax.margins(x=0)
plt.show()