Search code examples
python-3.xcurve-fittingleast-squaresapproximation

How to determine by which model my curve should be and approximate it?


I have a curve: enter image description here

and its digitized data. https://drive.google.com/open?id=1ZB39G3SmtamjVjmLzkC2JefloZ9iShpO

How should I choose a suitable function for the least squares method and how this approximation can be implemented on Python?

I tried to do it like that

import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import numpy as np
import sympy as sym



x = np.loadtxt("x_data.txt", delimiter='\t', dtype=np.float)
y = np.loadtxt("y_data.txt", delimiter='\t', dtype=np.float)



plt.plot(x, y, 'ro',label="Original Data")

x = np.array(x, dtype=float)  
y = np.array(y, dtype=float) 


def func(x, a, b, c, d):
    return a*x**3 + b*x**2 +c*x + d


popt, pcov = curve_fit(func, x, y)




xs = sym.Symbol('\lambda')    
tex = sym.latex(func(xs,*popt)).replace('$', '')
plt.title(r'$f(\lambda)= %s$' %(tex),fontsize=16)



plt.plot(x, func(x, *popt), label="Fitted Curve")  

plt.legend(loc='upper left')
plt.show()

thanks


Solution

  • I extracted data from your plot for analysis, and here is my first cut at the problem. Since your plot used decade-log scaling, I took the anti-decade-log of the "left-side" Y data for fitting. My peak equation search on this "extracted" data turned up a log-normal type of equation, and here is a graphical python fitter reading in data files and fitting to this equation. This fitter uses the scipy differential_evolution genetic algorithm module to determine initial parameter estimates for the non-linear fitter, which requires parameter ranges within which to search. In this code the data maximum and minimum values are used along with my range estimates. It is much easier to estimate ranges on initial parameter values than specific values. This fitter should be able to directly read your data files. If you can post or link to the actual data I might be able to make a better fit than is shown here.

    plot

    import numpy, scipy, matplotlib
    import matplotlib.pyplot as plt
    from scipy.optimize import curve_fit
    from scipy.optimize import differential_evolution
    import warnings
    
    with open('./x_data.txt', 'rt') as f:
        x_file = f.read()
    
    with open('./y_data.txt', 'rt') as f:
        y_file = f.read()
    
    xlist = []
    for line in x_file.split('\n'):
        if line: # this allows blank lines in file
            xlist.append(float(line.strip()))
    ylist = []
    for line in y_file.split('\n'):
        if line: # this allows blank lines in file
            ylist.append(float(line.strip()))
    
    if len(xlist) != len(ylist):
        print(len(xlist), len(ylist))
        raise Exception('X and Y habe different length')
    
    xData = numpy.array(xlist)
    yData = numpy.array(ylist)
    
    
    def func(t, a, b, c, d): # Log-Normal Peak A Shifted from zunzun.com
        return a * numpy.exp(-0.5 * numpy.power((numpy.log(t-d)-b) / c, 2.0))
    
    
    # function for genetic algorithm to minimize (sum of squared error)
    def sumOfSquaredError(parameterTuple):
        warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
        val = func(xData, *parameterTuple)
        return numpy.sum((yData - val) ** 2.0)
    
    
    def generate_Initial_Parameters():
        # min and max used for bounds
        maxX = max(xData)
        minX = min(xData)
        maxY = max(yData)
        minY = min(yData)
    
        parameterBounds = []
        parameterBounds.append([minY, maxY]) # search bounds for a
        parameterBounds.append([0.0, 2.0]) # search bounds for b
        parameterBounds.append([-1.0, 0.0]) # search bounds for c
        parameterBounds.append([-maxX, 0.0]) # search bounds for d
    
        # "seed" the numpy random number generator for repeatable results
        result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
        return result.x
    
    # by default, differential_evolution completes by calling curve_fit() using parameter bounds
    geneticParameters = generate_Initial_Parameters()
    
    # now call curve_fit without passing bounds from the genetic algorithm,
    # just in case the best fit parameters are aoutside those bounds
    fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
    print('Fitted parameters:', fittedParameters)
    print()
    
    modelPredictions = func(xData, *fittedParameters) 
    
    absError = modelPredictions - yData
    
    SE = numpy.square(absError) # squared errors
    MSE = numpy.mean(SE) # mean squared errors
    RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
    Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
    
    print()
    print('RMSE:', RMSE)
    print('R-squared:', Rsquared)
    
    print()
    
    
    ##########################################################
    # graphics output section
    def ModelAndScatterPlot(graphWidth, graphHeight):
        f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
        axes = f.add_subplot(111)
    
        # first the raw data as a scatter plot
        axes.plot(xData, yData,  'D')
    
        # create data for the fitted equation plot
        xModel = numpy.linspace(min(xData), max(xData), 1000)
        yModel = func(xModel, *fittedParameters)
    
        # now the model as a line plot
        axes.plot(xModel, yModel)
    
        axes.set_xlabel('X Data') # X axis data label
        axes.set_ylabel('Y Data') # Y axis data label
    
        plt.show()
        plt.close('all') # clean up after using pyplot
    
    graphWidth = 800
    graphHeight = 600
    ModelAndScatterPlot(graphWidth, graphHeight)
    

    EDIT: use actual data

    With the actual data now available I found that a Pulse peak equation gives a good fit to the data, here is an updated fitter. I recommend taking additional data from time 0 to 5 if possible, this will yield data that better characterizes the region at the beginning of the peak.

    enter image description here

    import numpy, scipy, matplotlib
    import matplotlib.pyplot as plt
    from scipy.optimize import curve_fit
    from scipy.optimize import differential_evolution
    import warnings
    
    with open('./x_data.txt', 'rt') as f:
        x_file = f.read()
    
    with open('./y_data.txt', 'rt') as f:
        y_file = f.read()
    
    xlist = []
    for line in x_file.split('\n'):
        if line: # this allows blank lines in file
            xlist.append(float(line.strip()))
    ylist = []
    for line in y_file.split('\n'):
        if line: # this allows blank lines in file
            ylist.append(float(line.strip()))
    
    if len(xlist) != len(ylist):
        print(len(xlist), len(ylist))
        raise Exception('X and Y have different length')
    
    xData = numpy.array(xlist)
    yData = numpy.array(ylist)
    
    
    def func(t, a, b, c, Offset): # Pulse Peak With Offset from zunzun.com
        return 4.0 * a * numpy.exp(-1.0 * (t-b) / c) * (1.0 - numpy.exp(-1.0 * (t-b) / c)) + Offset
    
    
    # function for genetic algorithm to minimize (sum of squared error)
    def sumOfSquaredError(parameterTuple):
        warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
        val = func(xData, *parameterTuple)
        return numpy.sum((yData - val) ** 2.0)
    
    
    def generate_Initial_Parameters():
        # min and max used for bounds
        maxX = max(xData)
        minX = min(xData)
        maxY = max(yData)
        minY = min(yData)
    
        parameterBounds = []
        parameterBounds.append([minY, maxY]) # search bounds for a
        parameterBounds.append([-5.0, 0.0]) # search bounds for b
        parameterBounds.append([1.0, 10.0]) # search bounds for c
        parameterBounds.append([minY, maxY]) # search bounds for Offset
    
        # "seed" the numpy random number generator for repeatable results
        result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
        return result.x
    
    # by default, differential_evolution completes by calling curve_fit() using parameter bounds
    geneticParameters = generate_Initial_Parameters()
    
    # now call curve_fit without passing bounds from the genetic algorithm,
    # just in case the best fit parameters are aoutside those bounds
    fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
    print('Fitted parameters:', fittedParameters)
    print()
    
    modelPredictions = func(xData, *fittedParameters) 
    
    absError = modelPredictions - yData
    
    SE = numpy.square(absError) # squared errors
    MSE = numpy.mean(SE) # mean squared errors
    RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
    Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
    
    print()
    print('RMSE:', RMSE)
    print('R-squared:', Rsquared)
    
    print()
    
    
    ##########################################################
    # graphics output section
    def ModelAndScatterPlot(graphWidth, graphHeight):
        f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
        axes = f.add_subplot(111)
    
        # first the raw data as a scatter plot
        axes.plot(xData, yData,  'D')
    
        # create data for the fitted equation plot
        xModel = numpy.linspace(min(xData), max(xData), 1000)
        yModel = func(xModel, *fittedParameters)
    
        # now the model as a line plot
        axes.plot(xModel, yModel)
    
        axes.set_xlabel('X Data') # X axis data label
        axes.set_ylabel('Y Data') # Y axis data label
    
        plt.show()
        plt.close('all') # clean up after using pyplot
    
    graphWidth = 800
    graphHeight = 600
    ModelAndScatterPlot(graphWidth, graphHeight)