Search code examples
pythonplotlyspline

How to draw spline that is not connecting all points


I'm trying to plot a massive amount of data with a spline running through the points; it should look like this: enter image description here

But when I try to do it with plotly, the spline insists on going through all the points, like this: enter image description here enter image description here

Of these two, the first image shows only the data points and the second shows the spline.

The code I tried is:

# Question snippet: draws the raw points as markers plus a second trace whose
# line shape is set to 'spline'. `dates_arr` and `data_points` are placeholders
# that are not defined anywhere in this snippet.
# NOTE(review): `dates` is a plain list here, and lists have no `strftime`;
# presumably the real code holds a pandas DatetimeIndex -- confirm.
dates = [dates_arr]
x = dates.strftime("%Y-%m-%d")
y = [data_points]
# marker-only trace for the individual observations
xy_data = go.Scatter(x=x, y=y, mode='markers', marker=dict(size=4), 
name='AAPL')

# Second trace over the very same x/y values. `line_shape='spline'` only
# changes how the line between consecutive points is drawn, so the curve
# still passes through every data point instead of approximating them.
mov_avg = go.Scatter(x=x, y=y, name="spline",text= 
["spline"],hoverinfo='text+name',line_shape='spline', line_smoothing = 1.3)    

data = [xy_data, mov_avg]  

# NOTE(review): `py` is not imported in this snippet; presumably plotly's
# online plotting module -- confirm.
py.iplot(data, filename='Spline fit')

#################################
# plots the same two traces again under a different filename
first_plot_url = py.plot(data, filename='apple stock moving average', 
auto_open=True, )

Does anyone have an idea?


Solution

  • In your first image, the spline is an approximation fitted to all of your data points. In your snippet, spline is merely a shape attribute of the line drawn between your data points, so the line still passes through every one of them. These are very different things. To accomplish what you are looking for, you should take a closer look at the contributions from users np8 and Matthew Drury in other SO posts and on GitHub. You should also take a closer look at how different splines are calculated. The plot below, where a natural cubic spline is estimated, is produced by the code sample named Snippet 2: The whole thing. It's pretty large, but that's mostly because of the function get_natural_cubic_spline_model from Python natural smoothing splines. The plotly part simply follows this logic:

    Snippet 1: Focuses only on the plotly part

    # marker trace: the raw observations
    points = go.Scatter(x=x, y=y, mode='markers', name='iris')

    # line trace: the spline estimates stored in df_spline
    line = go.Scatter(x=df_spline['x'], y=df_spline['y_est'], mode='lines', name='spline')

    # assemble the figure from both traces and render it
    data = [points, line]
    fig = go.Figure(data=data)
    fig.show()
    

    Plot:

    enter image description here

    Snippet 2: The whole thing

    # imports
    import plotly.express as px
    import plotly.graph_objs as go
    import numpy as np
    import pandas as pd
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline
    
    # sample data set: px.data.iris() hands back the iris data as a pandas DataFrame
    iris = px.data.iris()
    x, y = iris['sepal_length'], iris['sepal_width']
    
    # spline using function from https://stackoverflow.com/questions/51321100/python-natural-smoothing-splines
    
    def get_natural_cubic_spline_model(x, y, minval=None, maxval=None, n_knots=None, knots=None):
        """
        Fit a natural cubic spline regression model to the data.

        The model is a two-step pipeline: a NaturalCubicSpline basis expansion
        of `x` followed by an ordinary LinearRegression on the expanded features.

        For the knots, give (a) `knots` (as an array) or (b) minval, maxval and n_knots.

        If the knots are not directly specified, the resulting knots are equally
        spaced within the *interior* of (min, max).  That is, the endpoints are
        *not* included as knots.

        Parameters
        ----------
        x: np.array of float
            The input data
        y: np.array of float
            The output data
        minval: float
            Minimum of interval containing the knots.
        maxval: float
            Maximum of the interval containing the knots.
        n_knots: positive integer
            The number of knots to create.
        knots: array or list of floats
            The knots. Takes precedence over minval/maxval/n_knots when it is
            given and non-empty.

        Returns
        --------
        model: a model object
            The fitted pipeline. It has the following method:
            - predict(x):
                x is a numpy array. This will return the predicted y-values.
        """

        # `knots` may be a numpy array, whose truth value is ambiguous (and a
        # one-element array holding 0 would even be falsy), so test for
        # None / emptiness explicitly instead of relying on `if knots:`.
        if knots is not None and len(knots) > 0:
            spline = NaturalCubicSpline(knots=knots)
        else:
            spline = NaturalCubicSpline(max=maxval, min=minval, n_knots=n_knots)

        p = Pipeline([
            ('nat_cubic', spline),
            ('regression', LinearRegression(fit_intercept=True))
        ])

        p.fit(x, y)

        return p
    
    
    class AbstractSpline(BaseEstimator, TransformerMixin):
        """Base class for all spline basis expansions.

        The knots are either passed in directly via `knots`, or generated as
        `n_knots` equally spaced points strictly inside the interval (min, max).
        When `n_knots` is not given it is derived from `n_params` through the
        subclass hook `_compute_n_knots`.

        Parameters
        ----------
        max: float
            Maximum of the interval containing the knots.
        min: float
            Minimum of the interval containing the knots.
        n_knots: positive integer
            The number of knots to create.
        n_params: positive integer
            Number of parameters; only used to derive `n_knots` when that is
            not given.
        knots: array or list of floats
            The knots. When given, all other arguments are ignored for the
            purpose of building the knot vector.

        Raises
        ------
        TypeError
            If `knots` is not given and the bounds, or both `n_knots` and
            `n_params`, are missing.
        """

        def __init__(self, max=None, min=None, n_knots=None, n_params=None, knots=None):
            # scikit-learn's estimator convention expects every __init__
            # parameter to be readable as an attribute of the same name
            # (get_params / repr look them up), so keep the bounds as passed.
            self.max = max
            self.min = min
            if knots is None:
                if min is None or max is None:
                    raise TypeError(
                        "either `knots` or both `min` and `max` must be given")
                if not n_knots:
                    if n_params is None:
                        raise TypeError(
                            "either `knots`, `n_knots` or `n_params` must be given")
                    n_knots = self._compute_n_knots(n_params)
                # n_knots + 2 equally spaced points including both endpoints;
                # dropping the endpoints leaves n_knots interior knots.
                knots = np.linspace(min, max, num=(n_knots + 2))[1:-1]
            self.knots = np.asarray(knots)

        @property
        def n_knots(self):
            """Number of knots currently stored on the instance."""
            return len(self.knots)

        def fit(self, *args, **kwargs):
            """Do nothing and return self; the expansion has nothing to learn."""
            return self
    
    
    class NaturalCubicSpline(AbstractSpline):
        """Apply a natural cubic basis expansion to an array.

        The features created with this basis expansion can be used to fit a
        piecewise cubic function under the constraint that the fitted curve is
        linear *outside* the range of the knots.  The fitted curve is continuously
        differentiable to the second order at all of the knots.

        This transformer can be created in two ways:
          - By specifying the maximum, minimum, and number of knots.
          - By specifying the cutpoints directly.

        If the knots are not directly specified, the resulting knots are equally
        spaced within the *interior* of (min, max).  That is, the endpoints are
        *not* included as knots.

        Parameters
        ----------
        min: float
            Minimum of interval containing the knots.
        max: float
            Maximum of the interval containing the knots.
        n_knots: positive integer
            The number of knots to create.
        knots: array or list of floats
            The knots.
        """

        def _compute_n_knots(self, n_params):
            """Return the number of knots needed to produce `n_params` features."""
            # transform() emits n_knots - 1 columns (see n_params below), so
            # asking for n_params columns requires one more knot than that.
            return n_params + 1

        @property
        def n_params(self):
            """Number of feature columns produced by transform()."""
            return self.n_knots - 1

        def transform(self, X, **transform_params):
            """Expand X into the natural cubic spline basis.

            Returns a DataFrame (named columns, same index) when X is a pandas
            Series, otherwise a numpy array of shape (n_samples, n_knots - 1).
            """
            X_spl = self._transform_array(X)
            if isinstance(X, pd.Series):
                col_names = self._make_names(X)
                X_spl = pd.DataFrame(X_spl, columns=col_names, index=X.index)
            return X_spl

        def _make_names(self, X):
            """Build the n_knots - 1 column names, prefixed with the Series name."""
            first_name = "{}_spline_linear".format(X.name)
            rest_names = ["{}_spline_{}".format(X.name, idx)
                          for idx in range(self.n_knots - 2)]
            return [first_name] + rest_names

        def _transform_array(self, X, **transform_params):
            """Compute the basis expansion of X as a plain numpy array."""
            # Flatten to 1-D; atleast_1d keeps a single value as a length-1
            # array so the output always has one row per sample.
            X = np.atleast_1d(np.asarray(X, dtype=float).squeeze())
            n_knots = self.n_knots
            last_knot = self.knots[n_knots - 1]

            X_spl = np.zeros((X.shape[0], n_knots - 1))
            # first column is the linear term
            X_spl[:, 0] = X

            def ppart(t):
                # positive part: max(0, t), elementwise
                return np.maximum(0, t)

            def cube(t):
                return t * t * t

            def d(knot_idx):
                # difference of truncated cubics at this knot and the last
                # knot, scaled by the distance between the two knots
                numerator = (cube(ppart(X - self.knots[knot_idx]))
                             - cube(ppart(X - last_knot)))
                denominator = last_knot - self.knots[knot_idx]
                return numerator / denominator

            if n_knots > 2:
                # the term for the second-to-last knot is the same for every
                # column, so compute it once instead of once per iteration
                d_ref = d(n_knots - 2)
                for i in range(n_knots - 2):
                    X_spl[:, i + 1] = d(i) - d_ref
            return X_spl
    
    # spline calculations: fit the model with 6 interior knots spanning the
    # range of x, then predict once and reuse the result below
    m1 = get_natural_cubic_spline_model(x, y, minval=min(x), maxval=max(x), n_knots=6)
    y_est_m1 = m1.predict(x)

    # gather results and sort them by x so that the line is not messed up
    df_spline = pd.DataFrame({'x': x,
                              'y': y,
                              'y_est': y_est_m1})
    df_spline = df_spline.sort_values(by=['x'])
    
    ### PLOTLY ###
    # marker trace: the raw observations
    points = go.Scatter(x=x, y=y, mode='markers', name='iris')

    # line trace: the spline estimates from the x-sorted results frame
    line = go.Scatter(x=df_spline['x'], y=df_spline['y_est'], mode='lines', name='spline')

    # assemble the figure from both traces and render it
    data = [points, line]
    fig = go.Figure(data=data)
    fig.show()