Search code examples
pythonpandasdataframerolling-computation

ValueError: invalid on specified as date_x, must be a column (of DataFrame), an Index or None


Trying to take a Monthly Rolling average from my DF based on 'ISIN' column on Tone column. this is the df:

import pandas as pd

# Given data lists
tone = [-0.397617, -1.217575, 0.101528, -0.736255, 1.077126]
date_x = ["2014-01-01 00:00:00", "2014-02-01 00:00:00", "2014-03-01 00:00:00", "2014-04-01 00:00:00", "2014-05-01 00:00:00"]
isin = ["DE0007664005", "DE0007664005", "DE0007664005", "DE0007664005", "DE0007664005"]

# Create DataFrame
df = pd.DataFrame({'ISIN': isin, 'date_x': date_x, 'Tone': tone})

# Convert 'date_x' to datetime
df['date_x'] = pd.to_datetime(df['date_x'])

so this is my code:

news.groupby('ISIN')['Tone'].transform(lambda s: s.rolling('30D',on='date_x').mean())

and this is my error:

----> 4 news.groupby('ISIN')['Tone'].transform(lambda s: s.rolling('30D',on='date_x').mean())

File c:\Users\user\anaconda3\envs\PythonCourse2023\Lib\site-packages\pandas\core\groupby\generic.py:517, in SeriesGroupBy.transform(self, func, engine, engine_kwargs, *args,
**kwargs)
    514 @Substitution(klass="Series", example=__examples_series_doc)
    515 @Appender(_transform_template)
    516 def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
--> 517     return self._transform(
    518         func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
    519     )

File c:\Users\user\anaconda3\envs\PythonCourse2023\Lib\site-packages\pandas\core\groupby\groupby.py:2021, in GroupBy._transform(self, func, engine, engine_kwargs, *args,
**kwargs)    2018     warn_alias_replacement(self, orig_func, func)    2020 if not isinstance(func, str):
-> 2021     return self._transform_general(func, engine, engine_kwargs, *args, **kwargs)    2023 elif func not in base.transform_kernel_allowlist:    2024     msg = f"'{func}' is not a valid function name for transform(name)"

File c:\Users\user\anaconda3\envs\PythonCourse2023\Lib\site-packages\pandas\core\groupby\generic.py:557, in SeriesGroupBy._transform_general(self, func, engine, engine_kwargs,
*args, **kwargs)
    552 for name, group in self._grouper.get_iterator(
    553     self._obj_with_exclusions, axis=self.axis
    554 ):
    555     # this setattr is needed for test_transform_lambda_with_datetimetz
    556     object.__setattr__(group, "name", name)
--> 557     res = func(group, *args, **kwargs)
    559     results.append(klass(res, index=group.index))
    561 # check for empty "results" to avoid concat ValueError

Cell In[18], line 4
      1 # news.set_index('date_x', inplace=True)
      2 
      3 # news.groupby('Player').rolling(window='30D',on='date_x')['Tone'].mean()
----> 4 news.groupby('ISIN')['Tone'].transform(lambda s: s.rolling('30D',on='date_x').mean())
      5 # news.reset_index(inplace=True)

File c:\Users\user\anaconda3\envs\PythonCourse2023\Lib\site-packages\pandas\core\generic.py:12573, in NDFrame.rolling(self, window, min_periods, center, win_type, on, axis, closed, step, method)   12559 if win_type is not None:   12560   return Window(   12561         self,   12562         window=window,    (...)   12570         method=method,   12571     )
> 12573 return Rolling(   12574     self,   12575     window=window,   12576     min_periods=min_periods,   12577     center=center,   12578  win_type=win_type,   12579     on=on,   12580     axis=axis,   12581   closed=closed,   12582     step=step,   12583     method=method,   12584 )

File c:\Users\user\anaconda3\envs\PythonCourse2023\Lib\site-packages\pandas\core\window\rolling.py:164, in BaseWindow.__init__(self, obj, window, min_periods, center, win_type, axis, on, closed, step, method, selection)
    162     self._on = Index(self.obj[self.on])
    163 else:
--> 164     raise ValueError(
    165         f"invalid on specified as {self.on}, "
    166         "must be a column (of DataFrame), an Index or None"
    167     )
    169 self._selection = selection
    170 self._validate()

ValueError: invalid on specified as date_x, must be a column (of DataFrame), an Index or None

How can i take the rolling average of Tone based on the ISIN on monthly basis?


Solution

  • IIUC, you can do:

    out = (
        df.groupby("ISIN")
        .apply(
            lambda g: g.rolling("30D", on="date_x")["Tone"].mean().to_frame(),
            include_groups=False,
        )
        .droplevel(1)
    )
    print(out)