Search code examples
pythonpandasgroup-bykeyerror

column is not accessible using groupby and apply(lambda)


I'm encountering a KeyError when trying to use the .apply() method on a pandas DataFrame after performing a groupby. The goal is to calculate the weighted average baced on the Industry_adjusted_return column. The error indicates that the 'Industry_adjusted_return' column cannot be found. Below is a minimal example that reproduces the issue:

```
import pandas as pd

# Creating a small DataFrame
data = {
    'ISIN': ['DE000A1DAHH0', 'DE000KSAG888'],
    'Date': ['2017-03-01', '2017-03-01'],
    'MP_quintile': [0, 0],
    'Mcap_w': [8089460.00, 4154519.75],
    'Industry_adjusted_return': [-0.00869, 0.043052]
}

df = pd.DataFrame(data)
df['Date'] = pd.to_datetime(df['Date'])  # Ensure 'Date' is datetime type

I'm using Python 3.8 with pandas version 1.3.3. Any insights into why this error occurs and how to fix it would be greatly appreciated.

code:

for i,grouped in wa.groupby(['Date','MP_quintile']):
     print(i,grouped)
     weighted_average_returns = grouped.apply(lambda x: (x['Industry_adjusted_return'] * (x['Mcap_w'] / x['Mcap_w'].sum())).sum())

the Error

{
    "name": "KeyError",
    "message": "'Industry_adjusted_return'",
    "stack": "---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File c:\\Users\\mbkoo\\anaconda3\\envs\\myenv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3802, in Index.get_loc(self, key, method, tolerance)
   3801 try:
-> 3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:

File c:\\Users\\mbkoo\\anaconda3\\envs\\myenv\\Lib\\site-packages\\pandas\\_libs\\index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()

File c:\\Users\\mbkoo\\anaconda3\\envs\\myenv\\Lib\\site-packages\\pandas\\_libs\\index.pyx:146, in pandas._libs.index.IndexEngine.get_loc()

File pandas\\_libs\\index_class_helper.pxi:49, in pandas._libs.index.Int64Engine._check_type()

KeyError: 'Industry_adjusted_return'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[10], line 8
      3  print(i,grouped)
      4  #weighted_average_returns = grouped.apply( lambda x: ((x['Mcap_w'] / x['Mcap_w'].sum()))).sum()
      5 # grouped['weights_EW'] = 1 / len(grouped)
      6 # grouped['return_EW'] = grouped['Industry_adjusted_return'] * grouped['weights_EW']
----> 8  weighted_average_returns = grouped.apply(lambda x: (x['Industry_adjusted_return'] * (x['Mcap_w'] / x['Mcap_w'].sum())).sum()) #
      9 # equally_weighted_returns=grouped['return_EW'].sum()
     10 # # _df=cpd.from_dataframe(_df,allow_copy=True)
     11  break

File c:\\Users\\pandas\\core\\frame.py:9568, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
   9557 from pandas.core.apply import frame_apply
   9559 op = frame_apply(
   9560     self,
   9561     func=func,
   (...)
   9566     kwargs=kwargs,
   9567 )
-> 9568 return op.apply().__finalize__(self, method=\"apply\")

File c:\\Users\\pandas\\core\\apply.py:764, in FrameApply.apply(self)
    761 elif self.raw:
    762     return self.apply_raw()
--> 764 return self.apply_standard()

File c:\\Users\\pandas\\core\\apply.py:891, in FrameApply.apply_standard(self)
    890 def apply_standard(self):
--> 891     results, res_index = self.apply_series_generator()
    893     # wrap results
    894     return self.wrap_results(results, res_index)

File c:\\Users\\pandas\\core\\apply.py:907, in FrameApply.apply_series_generator(self)
    904 with option_context(\"mode.chained_assignment\", None):
    905     for i, v in enumerate(series_gen):
    906         # ignore SettingWithCopy here in case the user mutates
--> 907         results[i] = self.f(v)
    908         if isinstance(results[i], ABCSeries):
    909             # If we have a view on v, we need to make a copy because
    910             #  series_generator will swap out the underlying data
    911             results[i] = results[i].copy(deep=False)

Cell In[10], line 8, in <lambda>(x)
      3  print(i,grouped)
      4  #weighted_average_returns = grouped.apply( lambda x: ((x['Mcap_w'] / x['Mcap_w'].sum()))).sum()
      5 # grouped['weights_EW'] = 1 / len(grouped)
      6 # grouped['return_EW'] = grouped['Industry_adjusted_return'] * grouped['weights_EW']
----> 8  weighted_average_returns = grouped.apply(lambda x: (x['Industry_adjusted_return'] * (x['Mcap_w'] / x['Mcap_w'].sum())).sum()) #
      9 # equally_weighted_returns=grouped['return_EW'].sum()
     10 # # _df=cpd.from_dataframe(_df,allow_copy=True)
     11  break

File c:\\Users\\pandas\\core\\series.py:981, in Series.__getitem__(self, key)
    978     return self._values[key]
    980 elif key_is_scalar:
--> 981     return self._get_value(key)
    983 if is_hashable(key):
    984     # Otherwise index.get_value will raise InvalidIndexError
    985     try:
    986         # For labels that don't resolve as scalars like tuples and frozensets

File c:\\Users\\pandas\\core\\series.py:1089, in Series._get_value(self, label, takeable)
   1086     return self._values[label]
   1088 # Similar to Index.get_value, but we do not fall back to positional
-> 1089 loc = self.index.get_loc(label)
   1090 return self.index._get_values_for_loc(self, loc, label)

File c:\\Users\\pandas\\core\\indexes\\base.py:3804, in Index.get_loc(self, key, method, tolerance)
   3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:
-> 3804     raise KeyError(key) from err
   3805 except TypeError:
   3806     # If we have a listlike key, _check_indexing_error will raise
   3807     #  InvalidIndexError. Otherwise we fall through and re-raise
   3808     #  the TypeError.
   3809     self._check_indexing_error(key)

KeyError: 'Industry_adjusted_return'"
}

Solution

  • You should access the columns directly from grouped when calculating the weighted average. No need to use .apply() in this case since you're applying a vectorized operation:

    import pandas as pd
    
    data = {
        'ISIN': ['DE000A1DAHH0', 'DE000KSAG888'],
        'Date': ['2017-03-01', '2017-03-01'],
        'MP_quintile': [0, 0],
        'Mcap_w': [8089460.00, 4154519.75],
        'Industry_adjusted_return': [-0.00869, 0.043052]
    }
    
    df = pd.DataFrame(data)
    df['Date'] = pd.to_datetime(df['Date'])
    
    for i, grouped in df.groupby(['Date', 'MP_quintile']):
        print(f"Group: {i}\n{grouped}")
        
        weighted_average_returns = (grouped['Industry_adjusted_return'] * (grouped['Mcap_w'] / grouped['Mcap_w'].sum())).sum()
        
        print(f"Weighted Average Returns: {weighted_average_returns}\n")
    
    

    which returns

    Group: (Timestamp('2017-03-01 00:00:00'), 0)
               ISIN       Date  MP_quintile      Mcap_w  Industry_adjusted_return
    0  DE000A1DAHH0 2017-03-01            0  8089460.00                 -0.008690
    1  DE000KSAG888 2017-03-01            0  4154519.75                  0.043052
    Weighted Average Returns: 0.008866641328527188