Search code examples

Data is normally distributed, but the KS test returns a statistic of 1.0

I have an age variable. When I plotted it using a KDE plot and a Q-Q plot, the distribution looked normal; however, when I performed the KS test, the test statistic was 1.0 and the p-value was 0.0.

Can someone please help me explain this observation? I used the KS test on other variables, and for those the results were consistent with the visualizations.

# library
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sps                         

# the age variable 
age = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
       66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
       88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
       64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
       69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
       73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
       69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
       73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
       78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
       86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
       92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
       70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
       77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
       70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
       78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
       88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
       67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
       68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
       83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
       78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
       60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
       75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])

# Visualization
# `sm.qqplot` comes from statsmodels; the import was missing from the snippet,
# so it is brought into scope here to make the script runnable.
import statsmodels.api as sm

fig, ax = plt.subplots(1, 2)                         # Making (row, col) of plots
fig.set_figheight(4)                                 # set height
fig.set_figwidth(8)                                  # set width
sns.kdeplot(age, color='red',
            alpha=.1, fill=True,                     # boolean, not the string 'true'
            ax=ax[0])                                # Distribution plot
# fit=True lets statsmodels estimate loc/scale and standardize the sample
# before plotting, so the points are compared against the 45-degree line.
sm.qqplot(age, fit=True, line='45', ax=ax[1])        # qqplot
fig.tight_layout()                                   # Tight layout
plt.show()                                           # show plots

# KS test (because n > 50)
print('n =', age.size)
# NOTE: with no `args`, 'norm' means the *standard* normal N(0, 1); the raw
# ages are compared against that distribution, not one fitted to the data.
sps.kstest(age, 'norm')



  • @Timur Shtatland is correct. Your code is:

    sps.kstest(age, 'norm')

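    Without specifying the parameters of the normal distribution, you are comparing your data to a standard normal distribution (with mean 0 and standard deviation 1). Every age in your data is far above 0, so the empirical CDF and the standard normal CDF are maximally separated; it is therefore not surprising that the statistic is 1.0 and the p-value for the test is effectively zero. Instead, you should use the mean and standard deviation of your data. (Note that when the parameters are estimated from the same sample, the KS p-value is only approximate and tends to be too large; the Lilliefors variant of the test corrects for this.)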

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import norm
    from scipy import stats
    # Data
    data = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
            66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
            88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
            64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
            69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
            73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
            69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
            73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
            78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
            86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
            92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
            70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
            77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
            70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
            78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
            88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
            67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
            68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
            83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
            78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
            60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
            75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])
    # Fit a normal distribution to the data: norm.fit returns the
    # maximum-likelihood estimates (loc, scale), i.e. the sample mean and
    # standard deviation.
    mu, std = norm.fit(data)
    # Shapiro-Wilk test; the result unpacks as (statistic, p-value)
    shapiro_test = stats.shapiro(data)
    print("\nShapiro-Wilk Test:")
    print("Statistic: {:.2f}".format(shapiro_test[0]))
    print("p-value: {:.2f}".format(shapiro_test[1]))
    # Perform the KS test for normality against N(mu, std) rather than the
    # default standard normal N(0, 1); `args` supplies (loc, scale) to 'norm'.
    ks_statistic, p_value = stats.kstest(data, 'norm', args=(mu, std))
    print("\nKolmogorov-Smirnov Test:")
    print("Statistic: {:.2f}".format(ks_statistic))
    print("p-value: {:.2f}".format(p_value))

    which produces this:

    Shapiro-Wilk Test:
    Statistic: 0.99
    p-value: 0.07
    Kolmogorov-Smirnov Test:
    Statistic: 0.05
    p-value: 0.21

    I would also plot a histogram of the data and then overlay a normal density with the parameters from your data:

    # Create histogram of the data; density=True normalizes the bars so they
    # are on the same scale as the probability density drawn on top.
    count, bins, ignored = plt.hist(data, 20, density=True, alpha=0.5, color='gray')
    # Plot the PDF of the normal distribution over the current x-axis range
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2)
    # Put the fitted parameters in the title (the string was previously built
    # but never attached to the figure), then display the plot.
    title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
    plt.title(title)
    plt.show()

    enter image description here