Search code examples
pythonpandasapplymulti-index

Python Pandas apply qcut to grouped by level 0 of multi-index in multi-index dataframe


I have a multi-index dataframe in pandas (date and entity_id) and for each date/entity I have obseravtions of a number of variables (A, B ...). My goal is to create a dataframe with the same shape but where the values are replaced by their decile scores.

My test data looks like this:

enter image description here

I want to apply qcut to each column grouped by level 0 of the multi-index - the issue I have is creating a result Dataframe

This code

def qcut_sub_index(df_with_sub_index):
#     create empty return value same shape as passed dataframe
    df_return=pd.DataFrame()
    
    for date, sub_df in df_with_sub_index.groupby(level=0):
            df_return=df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False, duplicates='drop')))
    print(df_return)
    return df_return
    
print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))

returns

                      A
as_at_date entity_id   
2008-01-27 2928       0
           2932       3
           3083       6
           3333       9
2008-02-27 2928       3
           2935       9
           3333       0
           3874       6
2008-03-27 2928       1
           2932       2
           2934       0
           2936       9
           2937       4
           2939       9
           2940       7
           2943       3
           2944       0
           2945       8
           2946       6
           2947       5
           2949       4
                      B
as_at_date entity_id   
2008-01-27 2928       9
           2932       6
           3083       0
           3333       3
2008-02-27 2928       6
           2935       0
           3333       3
           3874       9
2008-03-27 2928       0
           2932       9
           2934       2
           2936       8
           2937       7
           2939       6
           2940       3
           2943       1
           2944       4
           2945       9
           2946       5
           2947       4
           2949       0
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-104-72ff0e6da288> in <module>
     11 
     12 
---> 13 print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))

~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
   7546             kwds=kwds,
   7547         )
-> 7548         return op.get_result()
   7549 
   7550     def applymap(self, func) -> "DataFrame":

~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
    178             return self.apply_raw()
    179 
--> 180         return self.apply_standard()
    181 
    182     def apply_empty_result(self):

~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
    272 
    273         # wrap results
--> 274         return self.wrap_results(results, res_index)
    275 
    276     def apply_series_generator(self) -> Tuple[ResType, "Index"]:

~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
    313         # see if we can infer the results
    314         if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 315             return self.wrap_results_for_axis(results, res_index)
    316 
    317         # dict of scalars

~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
    369 
    370         try:
--> 371             result = self.obj._constructor(data=results)
    372         except ValueError as err:
    373             if "arrays must all be same length" in str(err):

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
    466 
    467         elif isinstance(data, dict):
--> 468             mgr = init_dict(data, index, columns, dtype=dtype)
    469         elif isinstance(data, ma.MaskedArray):
    470             import numpy.ma.mrecords as mrecords

~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
    281             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
    282         ]
--> 283     return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    284 
    285 

~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
     76         # figure out the index, if necessary
     77         if index is None:
---> 78             index = extract_index(arrays)
     79         else:
     80             index = ensure_index(index)

~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in extract_index(data)
    385 
    386         if not indexes and not raw_lengths:
--> 387             raise ValueError("If using all scalar values, you must pass an index")
    388 
    389         if have_series:

ValueError: If using all scalar values, you must pass an index

so something is preventing the second application of the lambda function.

I'd appreciate your help, thanks for takign a look.

p.s. if this can be done implcitly without using apply would love to hear. thanks


Solution

  • You solution appears over complicated. Your terminology is none standard, multi-indexes have levels. Stated as qcut() by level 0 of multi-index (not talking about sub-frames which are not pandas concepts)

    Bring it all back together

    • use **kwargs approach to pass arguments to assign() for all columns in data frame
    • groupby(level=0) is as_of_date
    • transform() to get a row back for every entry in index
    s = 12
    df = pd.DataFrame({"as_at_date":np.random.choice(pd.date_range(dt.date(2020,1,27), periods=3, freq="M"), s),
                  "entity_id":np.random.randint(2900, 3500, s),
                  "A":np.random.random(s),
                  "B":np.random.random(s)*(10**np.random.randint(8,10,s))
                 }).sort_values(["as_at_date","entity_id"])
    
    df = df.set_index(["as_at_date","entity_id"])
    df2 = df.assign(**{c:df.groupby(level=0)[c].transform(lambda x: pd.qcut(x, 10, labels=False))
                for c in df.columns})
    
    

    df

                                 A             B
    as_at_date entity_id                        
    2020-01-31 2926       0.770121  2.883519e+07
               2943       0.187747  1.167975e+08
               2973       0.371721  3.133071e+07
               3104       0.243347  4.497294e+08
               3253       0.591022  7.796131e+08
               3362       0.810001  6.438441e+08
    2020-02-29 3185       0.690875  4.513044e+08
               3304       0.311436  4.561929e+07
    2020-03-31 2953       0.325846  7.770111e+08
               2981       0.918461  7.594753e+08
               3034       0.133053  6.767501e+08
               3355       0.624519  6.318104e+07
    

    df2

                          A  B
    as_at_date entity_id      
    2020-01-31 2926       7  0
               2943       0  3
               2973       3  1
               3104       1  5
               3253       5  9
               3362       9  7
    2020-02-29 3185       9  9
               3304       0  0
    2020-03-31 2953       3  9
               2981       9  6
               3034       0  3
               3355       6  0