Search code examples
pythonnumpyscipydata-analysiscurve-fitting

Issues with Double Gaussian Fit Using curve_fit in Python


I used find_peaks to locate the peaks and estimated the initial parameters for the double Gaussian fit. I expected the curve_fit function to accurately fit the double Gaussian to my data, aligning the peaks and widths correctly. However, the resulting fit does not match the data well, and the Gaussian peaks are misaligned.

Is there a way to achieve a more accurate fit?

import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import find_peaks
from scipy.optimize import curve_fit

# Data arrays
y_data = np.array([
    1.500e-04, 1.500e-04, 1.500e-04, 1.500e-04, 1.700e-04, 1.600e-04, 1.800e-04,
    1.600e-04, 1.700e-04, 2.300e-04, 2.500e-04, 3.200e-04, 3.200e-04, 3.800e-04,
    4.000e-04, 5.000e-04, 5.600e-04, 5.600e-04, 6.500e-04, 7.500e-04, 9.100e-04,
    1.180e-03, 1.550e-03, 2.110e-03, 2.880e-03, 3.850e-03, 5.200e-03, 6.780e-03,
    8.950e-03, 1.123e-02, 1.403e-02, 1.723e-02, 2.031e-02, 2.330e-02, 2.495e-02,
    2.433e-02, 2.171e-02, 1.725e-02, 1.231e-02, 8.080e-03, 4.980e-03, 2.960e-03,
    1.970e-03, 1.830e-03, 2.220e-03, 2.880e-03, 3.700e-03, 4.650e-03, 5.820e-03,
    7.150e-03, 8.450e-03, 9.510e-03, 1.006e-02, 9.660e-03, 8.560e-03, 6.910e-03,
    5.100e-03, 3.380e-03, 2.170e-03, 1.230e-03, 7.000e-04, 3.600e-04, 2.200e-04,
    1.700e-04, 1.500e-04, 1.600e-04, 1.100e-04, 1.100e-04, 1.200e-04, 1.000e-04,
    1.200e-04, 1.100e-04, 1.300e-04, 1.500e-04, 1.200e-04, 1.600e-04, 1.200e-04,
    1.200e-04, 1.200e-04, 1.200e-04, 1.100e-04, 1.500e-04, 1.500e-04, 1.300e-04,
    1.100e-04, 8.000e-05, 1.200e-04, 1.200e-04, 1.100e-04, 1.100e-04, 1.500e-04])

x_data = np.array([
    6555.101, 6555.201, 6555.301, 6555.401, 6555.501, 6555.601, 6555.701, 6555.801,
    6555.901, 6556.001, 6556.101, 6556.201, 6556.301, 6556.401, 6556.501, 6556.601,
    6556.701, 6556.801, 6556.901, 6557.001, 6557.101, 6557.201, 6557.301, 6557.401,
    6557.501, 6557.601, 6557.701, 6557.801, 6557.901, 6558.001, 6558.101, 6558.201,
    6558.301, 6558.401, 6558.501, 6558.601, 6558.701, 6558.801, 6558.901, 6559.001,
    6559.101, 6559.201, 6559.301, 6559.401, 6559.501, 6559.601, 6559.701, 6559.801,
    6559.901, 6560.001, 6560.101, 6560.201, 6560.301, 6560.401, 6560.501, 6560.601,
    6560.701, 6560.801, 6560.901, 6561.001, 6561.101, 6561.201, 6561.301, 6561.401,
    6561.501, 6561.601, 6561.701, 6561.801, 6561.901, 6562.001, 6562.101, 6562.201,
    6562.301, 6562.401, 6562.501, 6562.601, 6562.701, 6562.801, 6562.901, 6563.001,
    6563.101, 6563.201, 6563.301, 6563.401, 6563.501, 6563.601, 6563.701, 6563.801,
    6563.901, 6564.001, 6564.001])

# Find peaks
peaks, _ = find_peaks(y_data, height=0.0005)

# Define double Gaussian function
def double_gaussian(x, a1, b1, c1, a2, b2, c2):
    g1 = a1 * np.exp(-((x - b1) ** 2) / (2 * c1 ** 2))
    g2 = a2 * np.exp(-((x - b2) ** 2) / (2 * c2 ** 2))
    return g1 + g2

# Function to find Full Width at Half Maximum (FWHM)
def find_fwhm(x, y, peak_index):
    half_max = y[peak_index] / 2
    left_idx = np.where(y[:peak_index] <= half_max)[0][-1]
    right_idx = np.where(y[peak_index:] <= half_max)[0][0] + peak_index
    left_x = x[left_idx] + (half_max - y[left_idx]) * (x[left_idx + 1] - x[left_idx]) / (y[left_idx + 1] - y[left_idx])
    right_x = x[right_idx] + (half_max - y[right_idx]) * (x[right_idx + 1] - x[right_idx]) / (y[right_idx + 1] - y[right_idx])
    return right_x - left_x

# Calculate FWHM for each peak
fwhm1 = find_fwhm(x_data, y_data, peaks[0]) / 2.355
fwhm2 = find_fwhm(x_data, y_data, peaks[1]) / 2.355

# Initial guess for the parameters
initial_guess = [y_data[peaks[0]], x_data[peaks[0]], fwhm1, y_data[peaks[1]], x_data[peaks[1]], fwhm2]

# Fit the data using curve_fit
params, covariance = curve_fit(double_gaussian, x_data, y_data, p0=initial_guess)

# Generate fitted data
x_fit = np.linspace(min(x_data), max(x_data), 10000)
fitted_data = double_gaussian(x_fit, *params)

# Separate the two Gaussian components
g1 = params[0] * np.exp(-((x_fit - params[1]) ** 2

) / (2 * params[2] ** 2))
g2 = params[3] * np.exp(-((x_fit - params[4]) ** 2) / (2 * params[5] ** 2))

# Plot the data and the fit
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_fit, g1, 'g--', label='Gaussian 1')
plt.plot(x_fit, g2, 'm--', label='Gaussian 2')
plt.plot(x_fit, fitted_data, 'r--', label='Double Gaussian')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

And here is the resulting plot:

enter image description here


Solution

  • When data are not exactly Gaussian (peaks have bigger tailing, even in a small extent), it is a common approach to fit a Voigt or a Pseudo-Voigt (which is easier to compute) profile instead of Gaussian.

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import optimize, stats
    

    We define the Pseudo Voigt peak:

    def pseudo_voigt(x, eta, sigma, gamma, x0, A):
        G = stats.norm.pdf(x, scale=sigma, loc=x0)
        L = stats.cauchy.pdf(x, scale=2. * gamma, loc=x0)
        return A * ((1. - eta) * G + eta * L)
    

    And the model for double peaks:

    def model2(x, eta0, sigma0, gamma0, x00, A0, eta1, sigma1, gamma1, x01, A1):
        return pseudo_voigt(x, eta0, sigma0, gamma0, x00, A0) + pseudo_voigt(x, eta1, sigma1, gamma1, x01, A1)
    

    We fit:

    popt2, pcov2 = optimize.curve_fit(
        model2,
        x_data, y_data,
        p0=[0.5, 1., 1., 6559, 0.025, 0.5, 1., 1., 6561, 0.01],
        bounds=[
             (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             (1, np.inf, np.inf, np.inf, np.inf, 1, np.inf, np.inf, np.inf, np.inf)
        ]
    )
    # array([5.04788238e-01, 3.39173299e-01, 2.55686905e-01, 6.55847681e+03,
    #        2.75922544e-02, 3.34888395e-19, 3.35457589e-01, 4.01394103e+00,
    #        6.56029885e+03, 7.93603468e-03])
    

    Results is not totally perfect:

    enter image description here

    But at least the first peak reach its nominal height and its tailing is taken into account.