Search code examples
pythonscipycurve-fitting

SciPy leastsq fit to a sine wave failing


I am trying to figure out what it is I don't understand here.

I am following http://www.scipy.org/Cookbook/FittingData and trying to fit a sine wave. The real problem is satellite magnetometer data which makes a nice sine wave on a spinning spacecraft. I created a dataset then am trying to fit it to recover the inputs.

Here is my code:

import numpy as np
from scipy import optimize

from scipy.optimize import curve_fit, leastsq

import matplotlib.pyplot as plt


class Parameter:
    def __init__(self, value):
            self.value = value

    def set(self, value):
            self.value = value

    def __call__(self):
            return self.value

def fit(function, parameters, y, x = None):
    def f(params):
        i = 0
        for p in parameters:
            p.set(params[i])
            i += 1
        return y - function(x)

    if x is None: x = np.arange(y.shape[0])
    p = [param() for param in parameters]
    return optimize.leastsq(f, p, full_output=True, ftol=1e-6, xtol=1e-6)

# generate a perfect data set (my real data have tiny error)
def mysine(x, a1, a2, a3):
    return a1 * np.sin(a2 * x + a3)

xReal = np.arange(500)/10.
a1 = 200.
a2 = 2*np.pi/10.5  # omega, 10.5 is the period
a3 = np.deg2rad(10.) # 10 degree phase offset
yReal = mysine(xReal, a1, a2, a3)

# plot the real data
plt.figure(figsize=(15,5))
plt.plot(xReal, yReal, 'r', label='Real Values')

# giving initial parameters
amplitude = Parameter(175.)
frequency = Parameter(2*np.pi/8.)
phase = Parameter(0.0)

# define your function:
def f(x): return amplitude() * np.sin(frequency() * x + phase())

# fit! (given that data is an array with the data to fit)
out = fit(f, [amplitude, frequency, phase], yReal, xReal)
period = 2*np.pi/frequency()
print amplitude(), period, np.rad2deg(phase())

xx = np.linspace(0, np.max(xReal), 50)
plt.plot( xx, f(xx) , label='fit')
plt.legend(shadow=True, fancybox=True)

Which makes this plot: enter image description here

The recovered fit parameters of [44.2434221897 8.094832581 -61.6204033699] have no resemblance to what I started with.

Any thoughts on what I am not understanding or doing wrong?

scipy.__version__
'0.10.1'

Edit: Fixing one parameter was suggested. In the example above fixing the amplitude to np.histogram(yReal)[1][-1] still produces unacceptable output. Fits: [175.0 8.31681375217 6.0] Should I try a different fitting method? Suggestions on which?

enter image description here


Solution

  • Here is some code implementing some of Zhenya's ideas. It uses

    yhat = fftpack.rfft(yReal)
    idx = (yhat**2).argmax()
    freqs = fftpack.rfftfreq(N, d = (xReal[1]-xReal[0])/(2*pi))
    frequency = freqs[idx]
    

    to guess the main frequency of the data, and

    amplitude = yReal.max()
    

    to guess the amplitude.


    import numpy as np
    import scipy.optimize as optimize
    import scipy.fftpack as fftpack
    import matplotlib.pyplot as plt
    pi = np.pi
    plt.figure(figsize = (15, 5))
    
    # generate a perfect data set (my real data have tiny error)
    def mysine(x, a1, a2, a3):
        return a1 * np.sin(a2 * x + a3)
    
    N = 5000
    xmax = 10
    xReal = np.linspace(0, xmax, N)
    a1 = 200.
    a2 = 2*pi/10.5  # omega, 10.5 is the period
    a3 = np.deg2rad(10.) # 10 degree phase offset
    print(a1, a2, a3)
    yReal = mysine(xReal, a1, a2, a3) + 0.2*np.random.normal(size=len(xReal))
    
    yhat = fftpack.rfft(yReal)
    idx = (yhat**2).argmax()
    freqs = fftpack.rfftfreq(N, d = (xReal[1]-xReal[0])/(2*pi))
    frequency = freqs[idx]
    
    amplitude = yReal.max()
    guess = [amplitude, frequency, 0.]
    print(guess)
    (amplitude, frequency, phase), pcov = optimize.curve_fit(
        mysine, xReal, yReal, guess)
    
    period = 2*pi/frequency
    print(amplitude, frequency, phase)
    
    xx = xReal
    yy = mysine(xx, amplitude, frequency, phase)
    # plot the real data
    plt.plot(xReal, yReal, 'r', label = 'Real Values')
    plt.plot(xx, yy , label = 'fit')
    plt.legend(shadow = True, fancybox = True)
    plt.show()
    

    yields

    (200.0, 0.5983986006837702, 0.17453292519943295)   # (a1, a2, a3)
    [199.61981404516041, 0.61575216010359946, 0.0]     # guess
    (200.06145097308041, 0.59841420869261097, 0.17487141943703263) # fitted parameters
    

    Notice that by using fft, the guess for the frequency is already pretty close to final fitted parameter.

    It seems you do not need to fix any of the parameters. By making the frequency guess closer to the actual value, optimize.curve_fit is able to converge to a reasonable answer.