Search code examples
python scipy statistics weibull

Fitting data to weibull distribution


I have a set of integer values, and I want to fit them to a Weibull distribution and get the best-fit parameters. I then draw a histogram of the data together with the PDF of the Weibull distribution, using the best-fit parameters. This is the code I used.

from jtlHandler import *
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt



def get_pdf(latencies):
    """Estimate, plot and return a Gaussian-KDE density of ``latencies``.

    The density is evaluated on an evenly spaced grid running from the
    smallest to the largest observation, using one grid point per sample.

    Returns:
        A ``(grid, density)`` tuple of NumPy arrays.
    """
    samples = np.array(latencies)
    kde = st.gaussian_kde(samples)

    lowest, highest = np.min(samples), np.max(samples)
    grid = np.linspace(lowest, highest, len(samples))
    density = kde(grid)

    plt.plot(grid, density)
    plt.show()
    return (grid, density)

def fit_to_distribution(distribution, data):
    """Fit ``distribution`` to ``data`` and return the fitted parameters.

    This simply delegates to ``distribution.fit(data)``, which returns the
    maximum-likelihood estimates (MLEs) for the shape (if applicable),
    location and scale parameters. No starting estimates are passed here, so
    the distribution generates its own starting values for the fit.
    """
    return distribution.fit(data)

def make_distribution_pdf(dist, params, end):
    """Evaluate the fitted PDF of ``dist`` on an evenly spaced grid over [0, end].

    Args:
        dist: A distribution object exposing ``pdf(x, *shape, loc=, scale=)``.
        params: Fitted parameters laid out as ``(*shape, loc, scale)``, i.e.
            the last two entries are location and scale and anything before
            them is passed through as shape arguments.
        end: Upper bound of the grid. It also determines the number of grid
            points, so it may be an integer or a float.

    Returns:
        A pandas Series of PDF values indexed by the x positions.
    """
    shape_args = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # np.linspace requires an integer sample count; a float ``end`` (for
    # example the maximum of float-valued data) would otherwise raise a
    # TypeError. Integer inputs keep exactly the same number of points.
    num_points = int(np.ceil(end))

    # Build PDF and turn into pandas Series
    x = np.linspace(0, end, num_points)
    y = dist.pdf(x, *shape_args, loc=loc, scale=scale)
    return pd.Series(y, index=x)


# Load the latency samples and keep only the last tenth of them.
latencies = getLatencyList("filename")

tail_start = int(9 * (len(latencies) / 10))
latencies = latencies[tail_start:]
data = pd.Series(latencies)

# NOTE(review): weibull_max is kept as in the original analysis. If the data
# is right-skewed, st.weibull_min may be the intended family - confirm.
params = fit_to_distribution(st.weibull_max, data)
print(f"Parameters for the fit: {params}")

# Make PDF
pdf = make_distribution_pdf(st.weibull_max, params, max(latencies))

# Display
plt.figure()
ax = pdf.plot(lw=2, label='PDF', legend=True)
# ``density=True`` replaces the ``normed=True`` keyword, which current
# matplotlib releases no longer accept; the histogram is still normalised so
# that it is comparable with the PDF curve.
data.plot(
    kind='hist',
    bins=200,
    density=True,
    alpha=0.5,
    label='Data',
    legend=True,
    ax=ax,
)

ax.set_title('Weibull distribution')
ax.set_xlabel('Latency')
ax.set_ylabel('Frequency')

plt.savefig("image.png")

This is the resulting figure. enter image description here

As can be seen, the Weibull approximation is not similar to the original distribution of the data.

How can I get the best Weibull approximation to my data?


Solution

  • You can fit a data set (set of numbers) to any distribution using the following two methods.

    import os
    import matplotlib.pyplot as plt
    import sys
    import math
    import numpy as np
    import scipy.stats as st
    from scipy.stats._continuous_distns import _distn_names
    from scipy.optimize import curve_fit
    
    def fit_to_distribution(distribution, latency_values):
        """Fit the named ``scipy.stats`` distribution to ``latency_values``.

        ``distribution`` is the attribute name of a distribution in
        ``scipy.stats`` (for example ``"norm"``). The value returned is
        whatever that distribution's ``fit`` method produces.
        """
        return getattr(st, distribution).fit(latency_values)
    
    
    def make_distribution_pdf(distribution, latency_list):
        """Fit the named distribution and evaluate its PDF over the data range.

        ``distribution`` is the attribute name of a distribution in
        ``scipy.stats``. The distribution is fitted to ``latency_list`` and
        its PDF is evaluated at 10000 evenly spaced points between the
        smallest and largest value in ``latency_list``.

        Returns:
            An ``(x, y)`` pair: the grid positions and the PDF values there.
        """
        dist = getattr(st, distribution)

        # The fitted parameters end with location and scale; everything
        # before them is passed to ``pdf`` as shape arguments.
        *shape_args, loc, scale = dist.fit(latency_list)

        grid = np.linspace(min(latency_list), max(latency_list), 10000)
        density = dist.pdf(grid, *shape_args, loc=loc, scale=scale)
        return grid, density