Search code examples
pythonlmfit

Python and lmfit: Fit Multiple Data Sets with different lengths


On the example of fit with multiple data sets from lmfit documentation, the data variable have rows of equal length.

I tried to adapt the example to run the fit using datasets with different lengths. The only change I made was changing the length of the x variable for each of the five different data sets, but it didn't worked out.

The error I got was:

data = np.array(data)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

And here is the code I tried to run:

import numpy as np
import matplotlib.pyplot as plt
from lmfit import minimize, Parameters, report_fit

def gauss(x, amp, cen, sigma):
    "basic gaussian"
    return amp*np.exp(-(x-cen)**2/(2.*sigma**2))

def gauss_dataset(params, i, x):
    """calc gaussian from params for data set i
    using simple, hardwired naming convention"""
    amp = params['amp_%i' % (i+1)].value
    cen = params['cen_%i' % (i+1)].value
    sig = params['sig_%i' % (i+1)].value
    return gauss(x, amp, cen, sig)

def objective(params, x, data):
    """ calculate total residual for fits to several data sets held
    in a 2-D array, and modeled by Gaussian functions"""
    ndata, nx = data.shape
    resid = 0.0*data[:]
    # make residual per data set
    for i in range(ndata):
        resid[i, :] = data[i, :] - gauss_dataset(params, i, x)
    # now flatten this to a 1D array, as minimize() needs
    return resid.flatten()

# create 5 datasets

data = []
for i in np.arange(5):
    #Now x has different length each iteration
    x  = np.linspace( -1, 2, 151+i)
    params = Parameters()
    amp   =  0.60 + 9.50*np.random.rand()
    cen   = -0.20 + 1.20*np.random.rand()
    sig   =  0.25 + 0.03*np.random.rand()
    dat   = gauss(x, amp, cen, sig) + np.random.normal(size=len(x), scale=0.1)
    data.append(dat)

# data has shape (5, 151)
data = np.array(data)
assert(data.shape) == (5, 151)

# create 5 sets of parameters, one per data set
fit_params = Parameters()
for iy, y in enumerate(data):
    fit_params.add( 'amp_%i' % (iy+1), value=0.5, min=0.0,  max=200)
    fit_params.add( 'cen_%i' % (iy+1), value=0.4, min=-2.0,  max=2.0)
    fit_params.add( 'sig_%i' % (iy+1), value=0.3, min=0.01, max=3.0)

# but now constrain all values of sigma to have the same value
# by assigning sig_2, sig_3, .. sig_5 to be equal to sig_1
for iy in (2, 3, 4, 5):
    fit_params['sig_%i' % iy].expr='sig_1'

# run the global fit to all the data sets
result = minimize(objective, fit_params, args=(x, data))
report_fit(result)

# plot the data sets and fits
plt.figure()
for i in range(5):
    y_fit = gauss_dataset(fit_params, i, x)
    plt.plot(x, data[i, :], 'o', x, y_fit, '-')

plt.show()

Solution

  • Here's and example of simultaneously fitting multiple (2) data sets of different sizes and forms:

    Dataset 1: Gaussian with amplitude, center, sigma Dataset 2: Decaying sine wave with amplitdue, frequency, decay, phase-shift

    Let's assert that the Gaussian center is equal to the frequency and the Gaussian sigma squared is the decay parameter for the sine wave.

    import matplotlib.pyplot as plt
    import numpy as np
    
    from lmfit import Parameters, minimize, report_fit
    from lmfit.lineshapes import gaussian
    
    # parameters for mock data
    gaus_amp = 200
    gaus_cen = 4.5
    gaus_sig = 7.0
    
    sine_amp = 17
    sine_shift = -0.75
    sine_freq = gaus_cen
    sine_decay = gaus_sig**2
    
    # dataset 1: gaussian
    x1 = np.linspace(-30, 30, 121)
    y1 = gaussian(x1, gaus_amp, gaus_cen, gaus_sig)
    y1 += np.random.normal(scale=0.25, size=len(y1))
    
    # dataset 2: decaying sine wave
    x2 = np.linspace(0, 150, 601)
    y2 = sine_amp * np.sin(sine_shift +x2/sine_freq) * np.exp(-x2**2/sine_decay**2)
    y2 += np.random.normal(scale=0.5, size=len(y2))
    
    def objective(params, x1, x2, y1, y2):
        gamp = params['g_amp']
        gcen = params['g_cen']
        gsig = params['g_sig']
        gresid = y1 - gaussian(x1, gamp, gcen, gsig)
    
        samp = params['s_amp']
        sshift = params['s_shift']
        sfreq = gcen
        sdecay = gsig**2
        sresid = y2 - samp * np.sin(sshift +x2/sfreq) * np.exp(-x2**2/sdecay**2)
    
        # now concatenate the residuals of the data sets
        return np.concatenate((gresid, sresid))
    
    params = Parameters()
    params.add('g_amp', value=150, min=0)
    params.add('g_cen', value=5, min=0)
    params.add('g_sig', value=5, min=0)
    params.add('s_amp', value=10, min=0)
    params.add('s_shift', value=-0.5, min=-3, max=3)
    
    # Run the fit and show the fitting result
    out = minimize(objective, params, args=(x1, x2, y1, y2))
    report_fit(out.params)
    

    The key here is np.concatenate() to make a single 1-D residual array from the residuals of the individual datasets.

    This example will run and print out the results of

    [[Variables]]
        g_amp:    201.092389 +/- 1.74123974 (0.87%) (init = 150)
        g_cen:    4.50385672 +/- 0.00477268 (0.11%) (init = 5)
        g_sig:    7.00094469 +/- 0.01821519 (0.26%) (init = 5)
        s_amp:    16.8828293 +/- 0.08010449 (0.47%) (init = 10)
        s_shift: -0.74422074 +/- 0.00555315 (0.75%) (init = -0.5)
    [[Correlations]] (unreported correlations are < 0.100)
        C(g_cen, s_shift) = +0.7679
        C(g_sig, s_amp)   = -0.5904
        C(g_amp, g_sig)   = +0.1502
    

    I'll leave it as an exercise for the reader to plot the fits to the two datasets and think about extending that to 30 datasets ;).