Search code examples
pythonpandasdataframepandas-groupbypandas-apply

KeyError with pandas groupby() apply()


Here I'm adding time to data where only date is given. There are 5 minutes between values or 288 values per date.
The code works when the input dataframe is 1 day (288 rows) or less, but gives an error when the input is longer. Any idea what I'm missing? Thanks in advance.

Relevant section of code:

import datetime as dt
print("Print df_raw:\n", df_raw)

df = df_raw[:288]
# df = df_raw[:289]  # Gives KeyError, see traceback below
print("\nPrint df BEFORE groubpy/apply:\n", df)
df.loc[:,'date'] = pd.to_datetime(df.date)

def f(x): 
    x['DT']=[val+dt.timedelta(minutes=(pos*5)) for val,pos in zip(x.loc[:,'date'], range(0,len(x.loc[:,'date'])))]
    return x

df = df.groupby('date').apply(f)
df = df.set_index('DT').drop(columns='date')

print("\nPrint df AFTER groubpy/apply:\n", df)

Output (with 288 rows or less, works as expected):

Print df_raw:
           date   values
0   2015-03-10   556.25
0   2015-03-10  516.993
0   2015-03-10   468.75
0   2015-03-10  432.812
0   2015-03-10  87.1095
..         ...      ...
84  2014-12-16     None
84  2014-12-16     None
84  2014-12-16  160.938
84  2014-12-16  145.118
84  2014-12-16  125.977

[24480 rows x 2 columns]

Print df BEFORE groubpy/apply:
           date   values
0   2015-03-10   556.25
0   2015-03-10  516.993
0   2015-03-10   468.75
0   2015-03-10  432.812
0   2015-03-10  87.1095
..         ...      ...
0   2015-03-10  781.446
0   2015-03-10   743.36
0   2015-03-10  708.985
0   2015-03-10  669.922
0   2015-03-10  632.422

[288 rows x 2 columns]

Print df AFTER groubpy/apply:
                       values
DT                          
2015-03-10 00:00:00   556.25
2015-03-10 00:05:00  516.993
2015-03-10 00:10:00   468.75
2015-03-10 00:15:00  432.812
2015-03-10 00:20:00  87.1095
...                      ...
2015-03-10 23:35:00  781.446
2015-03-10 23:40:00   743.36
2015-03-10 23:45:00  708.985
2015-03-10 23:50:00  669.922
2015-03-10 23:55:00  632.422

[288 rows x 1 columns]

Traceback (with 289 rows or more, I get this):

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
    724             try:
--> 725                 result = self._python_apply_general(f)
    726             except Exception:

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
    744         return self._wrap_applied_output(
--> 745             keys, values, not_indexed_same=mutated or self.mutated
    746         )

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\generic.py in _wrap_applied_output(self, keys, values, not_indexed_same)
    371         elif isinstance(v, DataFrame):
--> 372             return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
    373         elif self.grouper.groupings is not None:

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\groupby.py in _concat_objects(self, keys, values, not_indexed_same)
    954                 else:
--> 955                     result = result.reindex(ax, axis=self.axis)
    956 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    220         def wrapper(*args, **kwargs):
--> 221             return func(*args, **kwargs)
    222 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in reindex(self, *args, **kwargs)
   3975         kwargs.pop("labels", None)
-> 3976         return super().reindex(**kwargs)
   3977 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
   4513         return self._reindex_axes(
-> 4514             axes, level, limit, tolerance, method, fill_value, copy
   4515         ).__finalize__(self)

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3863             frame = frame._reindex_index(
-> 3864                 index, method, copy, level, fill_value, limit, tolerance
   3865             )

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   3885             fill_value=fill_value,
-> 3886             allow_dups=False,
   3887         )

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   4576                 allow_dups=allow_dups,
-> 4577                 copy=copy,
   4578             )

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\internals\managers.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   1250         if not allow_dups:
-> 1251             self.axes[axis]._can_reindex(indexer)
   1252 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
   3361         if not self.is_unique and len(indexer):
-> 3362             raise ValueError("cannot reindex from a duplicate axis")
   3363 

ValueError: cannot reindex from a duplicate axis

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'date'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-207-872197908aef> in <module>
     36     return x
     37 
---> 38 df = df.groupby('date').apply(f)
     39 df = df.set_index('DT').drop(columns='date')
     40 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
    735 
    736                 with _group_selection_context(self):
--> 737                     return self._python_apply_general(f)
    738 
    739         return result

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
    740 
    741     def _python_apply_general(self, f):
--> 742         keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
    743 
    744         return self._wrap_applied_output(

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
    235             # group might be modified
    236             group_axes = _get_axes(group)
--> 237             res = f(group)
    238             if not _is_indexed_like(res, group_axes):
    239                 mutated = True

<ipython-input-207-872197908aef> in f(x)
     33 
     34 def f(x):
---> 35     x['DT']=[val+dt.timedelta(minutes=(pos*5)) for val,pos in zip(x.loc[:,'date'], range(0,len(x.loc[:,'date'])))]
     36     return x
     37 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1416                 except (KeyError, IndexError, AttributeError):
   1417                     pass
-> 1418             return self._getitem_tuple(key)
   1419         else:
   1420             # we by definition only have the 0th axis

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
    803     def _getitem_tuple(self, tup):
    804         try:
--> 805             return self._getitem_lowerdim(tup)
    806         except IndexingError:
    807             pass

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
    927         for i, key in enumerate(tup):
    928             if is_label_like(key) or isinstance(key, tuple):
--> 929                 section = self._getitem_axis(key, axis=i)
    930 
    931                 # we have yielded a scalar ?

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1848         # fall thru to straight lookup
   1849         self._validate_key(key, axis)
-> 1850         return self._get_label(key, axis=axis)
   1851 
   1852 

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
    158             raise IndexingError("no slices here, handle elsewhere")
    159 
--> 160         return self.obj._xs(label, axis=axis)
    161 
    162     def _get_loc(self, key: int, axis: int):

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3727 
   3728         if axis == 1:
-> 3729             return self[key]
   3730 
   3731         self._consolidate_inplace()

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2993             if self.columns.nlevels > 1:
   2994                 return self._getitem_multilevel(key)
-> 2995             indexer = self.columns.get_loc(key)
   2996             if is_integer(indexer):
   2997                 indexer = [indexer]

~\AppData\Local\Continuum\miniconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'date'

EDIT
Thanks for all the help. Based on your suggestions, this solution seems to be working for me, with no SettingWithCopyWarning's:

import pandas as pd
import datetime as dt

# Note: df_raw is set elsewhere...
df = df_raw

df.loc[:,'date'] = pd.to_datetime(df['date'])

em = pd.DataFrame()

for key,t in df.groupby('date'):
    temp = t
    temp.loc[:,'DT'] = [i+dt.timedelta(minutes=j*5) for i,j in zip(t['date'],range(t['date'].shape[0]))]
    em = pd.concat([em,temp])

print(em)

Python: 3.7.4
Pandas: 0.25.3
Jupyter Lab


Solution

  • When you use apply function with groupby, you can't access the group key inside the function.

    Instead you can loop through the grouped dataframe group by group, and append the processed dataframes together.

    import pandas as pd
    
    t = pd.DataFrame({'date':['2015-03-10','2015-03-10','2015-03-10','2015-03-10','2015-03-11','2015-03-11','2015-03-11'],'value':[1,2,3,4,5,6,7]})
    
    t['date'] = pd.to_datetime(t['date'])
    

    Create a new dataframe and append the processed groups to it.

    em = pd.DataFrame()
    
    for key,df in t.groupby('date'):
        temp = df
        temp['dt'] = [i+datetime.timedelta(minutes=j*5) for i,j in zip(df['date'],range(df['date'].shape[0]))]
        em = pd.concat([em,temp])
    

    Output:

            date  value                  dt
    0 2015-03-10      1 2015-03-10 00:00:00
    1 2015-03-10      2 2015-03-10 00:05:00
    2 2015-03-10      3 2015-03-10 00:10:00
    3 2015-03-10      4 2015-03-10 00:15:00
    4 2015-03-11      5 2015-03-11 00:00:00
    5 2015-03-11      6 2015-03-11 00:05:00
    6 2015-03-11      7 2015-03-11 00:10:00