I'm trying to plot a massive amount of data with a spline running through the points; it should look like this:
But when I try to do it with Plotly, the spline insists on going through every single point, like this:
Here, the first image is only the data points and the second is the spline.
The code I tried is:
# NOTE(review): wrapping in [...] makes `dates` a plain Python list, and a list
# has no .strftime() method, so the next line raises AttributeError as written.
# Presumably `dates_arr` / `data_points` stand in for the real data -- confirm.
dates = [dates_arr]
x = dates.strftime("%Y-%m-%d")
y = [data_points]
# Trace 1: the raw observations, drawn as small markers only.
xy_data = go.Scatter(x=x, y=y, mode='markers', marker=dict(size=4),
name='AAPL')
# Trace 2: the very same x/y values, drawn with a spline-shaped line. Because it
# is fed the raw points, the line connects the data points instead of fitting them.
mov_avg = go.Scatter(x=x, y=y, name="spline",text=
["spline"],hoverinfo='text+name',line_shape='spline', line_smoothing = 1.3)
data = [xy_data, mov_avg]
# Show both traces under the name 'Spline fit' (presumably inline in a notebook).
py.iplot(data, filename='Spline fit')
#################################
# Plot the same traces again under another filename and keep the returned value.
first_plot_url = py.plot(data, filename='apple stock moving average',
auto_open=True, )
Does anyone have an idea?
In your first image, the spline is an approximation to all of your data points. In your snippet, `spline`
is only an attribute of the line drawn between your data points: `line_shape='spline'` smooths the path that connects every point, it does not fit a curve to the data. These are very different things. To accomplish what you are looking for, you should take a closer look at contributions from users np8
and Matthew Drury
on other SO posts and GitHub. You should also take a closer look at how different splines are calculated. The following plot, where a natural cubic spline is estimated, is produced by the code sample named "Snippet 2: The whole thing"
below. It's pretty large, but that's mostly because of the function `get_natural_cubic_spline_model`
from the post "Python natural smoothing splines". The plotly part simply follows this logic:
Snippet 1: Focuses only on the plotly part
# raw observations, shown as markers only
points = go.Scatter(x=x, y=y, mode='markers', name='iris')

# fitted spline, shown as a continuous line through the estimated values
line = go.Scatter(
    x=df_spline['x'],
    y=df_spline['y_est'],
    mode='lines',
    name='spline',
)

# collect both traces, build the figure from them and display it
data = [points, line]
fig = go.Figure(data=data)
fig.show()
Plot:
Snippet 2: The whole thing
# imports
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# sample data set: px.data.iris() returns the iris measurements as a pandas DataFrame
iris = px.data.iris()
x, y = iris['sepal_length'], iris['sepal_width']
# spline using function from https://stackoverflow.com/questions/51321100/python-natural-smoothing-splines
def get_natural_cubic_spline_model(x, y, minval=None, maxval=None, n_knots=None, knots=None):
    """
    Get a natural cubic spline model for the data.

    For the knots, give (a) `knots` (as an array) or (b) minval, maxval and n_knots.

    If the knots are not directly specified, the resulting knots are equally
    spaced within the *interior* of (max, min). That is, the endpoints are
    *not* included as knots.

    Parameters
    ----------
    x: np.array of float
        The input data
    y: np.array of float
        The output data
    minval: float
        Minimum of interval containing the knots.
    maxval: float
        Maximum of the interval containing the knots.
    n_knots: positive integer
        The number of knots to create.
    knots: array or list of floats
        The knots. A missing (None) or empty value selects option (b).

    Returns
    --------
    model: a model object
        The fitted pipeline (spline basis expansion followed by a linear
        regression). It has the following method:
        - predict(x):
            x is a numpy array. This will return the predicted y-values.
    """
    # `if knots:` would raise "truth value of an array is ambiguous" for a
    # NumPy array holding more than one knot, so test for None and emptiness
    # explicitly. An empty sequence still falls back to minval/maxval/n_knots.
    if knots is not None and len(knots) > 0:
        spline = NaturalCubicSpline(knots=knots)
    else:
        spline = NaturalCubicSpline(max=maxval, min=minval, n_knots=n_knots)

    p = Pipeline([
        ('nat_cubic', spline),
        ('regression', LinearRegression(fit_intercept=True))
    ])

    p.fit(x, y)

    return p
class AbstractSpline(BaseEstimator, TransformerMixin):
    """Base class for all spline basis expansions.

    The knots are either passed in directly through `knots` or derived from
    `min`, `max` and the number of knots. Subclasses must provide
    `_compute_n_knots(n_params)`, which is used to derive the number of knots
    when `n_knots` is not given.

    Parameters
    ----------
    max: float
        Maximum of the interval containing the knots.
    min: float
        Minimum of the interval containing the knots.
    n_knots: positive integer
        The number of knots to create.
    n_params: positive integer
        Used to derive the number of knots when `n_knots` is not given.
    knots: array or list of floats
        The knots. When given, all other parameters are ignored.
    """

    # `max` and `min` shadow the builtins inside __init__; the names are kept
    # because they are part of the constructor's keyword interface.
    def __init__(self, max=None, min=None, n_knots=None, n_params=None, knots=None):
        if knots is None:
            if not n_knots:
                n_knots = self._compute_n_knots(n_params)
            # n_knots + 2 evenly spaced points with both endpoints dropped,
            # which leaves n_knots knots strictly inside (min, max).
            knots = np.linspace(min, max, num=(n_knots + 2))[1:-1]
        self.knots = np.asarray(knots)

    @property
    def n_knots(self):
        """Number of knots held by this expansion."""
        return len(self.knots)

    def fit(self, *args, **kwargs):
        """Do nothing and return self; the arguments are ignored."""
        return self
class NaturalCubicSpline(AbstractSpline):
    """Natural cubic spline basis expansion of an array.

    The generated features can be used to fit a piecewise cubic function
    that is constrained to be linear *outside* the range of the knots. The
    fitted curve is continuously differentiable to the second order at all
    of the knots.

    An instance can be configured in two ways:

    - by giving the minimum, the maximum and the number of knots, or
    - by giving the cutpoints (knots) directly.

    When the knots are not given directly, they are equally spaced within
    the *interior* of (max, min); the endpoints are *not* included as knots.

    Parameters
    ----------
    min: float
        Minimum of interval containing the knots.
    max: float
        Maximum of the interval containing the knots.
    n_knots: positive integer
        The number of knots to create.
    knots: array or list of floats
        The knots.
    """

    def _compute_n_knots(self, n_params):
        # The number of knots is taken to be the number of parameters.
        return n_params

    @property
    def n_params(self):
        """Number of basis columns produced by `transform`."""
        return self.n_knots - 1

    def transform(self, X, **transform_params):
        """Return the spline basis for X; a Series yields a labelled DataFrame."""
        basis = self._transform_array(X)
        if not isinstance(X, pd.Series):
            return basis
        return pd.DataFrame(basis, columns=self._make_names(X), index=X.index)

    def _make_names(self, X):
        """Build one column name per basis column from the name of X."""
        prefix = X.name
        names = ["{}_spline_linear".format(prefix)]
        for idx in range(self.n_knots - 2):
            names.append("{}_spline_{}".format(prefix, idx))
        return names

    def _transform_array(self, X, **transform_params):
        """Compute the basis matrix with n_knots - 1 columns for X."""
        X = X.squeeze()
        try:
            n_rows = X.shape[0]
        except IndexError:  # For arrays with only one element
            n_rows = 1
        X_spl = np.zeros((n_rows, self.n_knots - 1))

        # First column: the linear term, i.e. the input itself.
        X_spl[:, 0] = X

        def positive_part(t):
            return np.maximum(0, t)

        def cubed(t):
            return t*t*t

        def d(knot_idx, x):
            knot = self.knots[knot_idx]
            last_knot = self.knots[self.n_knots - 1]
            top = cubed(positive_part(x - knot)) - cubed(positive_part(x - last_knot))
            bottom = last_knot - knot
            return top / bottom

        # Remaining columns: d_i - d_(K-2) for every knot except the last two.
        second_to_last = self.n_knots - 2
        for col, knot_idx in enumerate(range(0, second_to_last), start=1):
            X_spl[:, col] = (d(knot_idx, X) - d(second_to_last, X)).squeeze()

        return X_spl
# spline calculations: fit a natural cubic spline with 6 interior knots over
# the range of x, then evaluate it once at the observed x values
m1 = get_natural_cubic_spline_model(x, y, minval=min(x), maxval=max(x), n_knots=6)
y_est_m1 = m1.predict(x)

# gather results and sort them by x so that the line is not messed up
# (reuse the prediction computed above instead of calling predict() again)
df_spline = pd.DataFrame({'x': x,
                          'y': y,
                          'y_est': y_est_m1})
df_spline = df_spline.sort_values(by=['x'])
### PLOTLY ###
# data source: the raw observations as markers
points = go.Scatter(x=x, y=y, mode='markers', name='iris')

# spline: the estimated values from the sorted frame, drawn as a line
line = go.Scatter(
    x=df_spline['x'],
    y=df_spline['y_est'],
    mode='lines',
    name='spline',
)

# gather both traces, build the figure and plot it
data = [points, line]
fig = go.Figure(data=data)
fig.show()