I have a multi-index dataframe in pandas (date and entity_id) and for each date/entity I have obseravtions of a number of variables (A, B ...). My goal is to create a dataframe with the same shape but where the values are replaced by their decile scores.
My test data looks like this:
I want to apply qcut to each column grouped by level 0 of the multi-index - the issue I have is creating a result Dataframe
This code
def qcut_sub_index(df_with_sub_index):
# create empty return value same shape as passed dataframe
df_return=pd.DataFrame()
for date, sub_df in df_with_sub_index.groupby(level=0):
df_return=df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False, duplicates='drop')))
print(df_return)
return df_return
print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
returns
A
as_at_date entity_id
2008-01-27 2928 0
2932 3
3083 6
3333 9
2008-02-27 2928 3
2935 9
3333 0
3874 6
2008-03-27 2928 1
2932 2
2934 0
2936 9
2937 4
2939 9
2940 7
2943 3
2944 0
2945 8
2946 6
2947 5
2949 4
B
as_at_date entity_id
2008-01-27 2928 9
2932 6
3083 0
3333 3
2008-02-27 2928 6
2935 0
3333 3
3874 9
2008-03-27 2928 0
2932 9
2934 2
2936 8
2937 7
2939 6
2940 3
2943 1
2944 4
2945 9
2946 5
2947 4
2949 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-104-72ff0e6da288> in <module>
11
12
---> 13 print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
272
273 # wrap results
--> 274 return self.wrap_results(results, res_index)
275
276 def apply_series_generator(self) -> Tuple[ResType, "Index"]:
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
313 # see if we can infer the results
314 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 315 return self.wrap_results_for_axis(results, res_index)
316
317 # dict of scalars
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
369
370 try:
--> 371 result = self.obj._constructor(data=results)
372 except ValueError as err:
373 if "arrays must all be same length" in str(err):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
76 # figure out the index, if necessary
77 if index is None:
---> 78 index = extract_index(arrays)
79 else:
80 index = ensure_index(index)
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in extract_index(data)
385
386 if not indexes and not raw_lengths:
--> 387 raise ValueError("If using all scalar values, you must pass an index")
388
389 if have_series:
ValueError: If using all scalar values, you must pass an index
so something is preventing the second application of the lambda function.
I'd appreciate your help, thanks for takign a look.
p.s. if this can be done implcitly without using apply would love to hear. thanks
You solution appears over complicated. Your terminology is none standard, multi-indexes have levels. Stated as qcut()
by level 0 of multi-index (not talking about sub-frames which are not pandas concepts)
Bring it all back together
**kwargs
approach to pass arguments to assign()
for all columns in data framegroupby(level=0)
is as_of_datetransform()
to get a row back for every entry in indexs = 12
df = pd.DataFrame({"as_at_date":np.random.choice(pd.date_range(dt.date(2020,1,27), periods=3, freq="M"), s),
"entity_id":np.random.randint(2900, 3500, s),
"A":np.random.random(s),
"B":np.random.random(s)*(10**np.random.randint(8,10,s))
}).sort_values(["as_at_date","entity_id"])
df = df.set_index(["as_at_date","entity_id"])
df2 = df.assign(**{c:df.groupby(level=0)[c].transform(lambda x: pd.qcut(x, 10, labels=False))
for c in df.columns})
A B
as_at_date entity_id
2020-01-31 2926 0.770121 2.883519e+07
2943 0.187747 1.167975e+08
2973 0.371721 3.133071e+07
3104 0.243347 4.497294e+08
3253 0.591022 7.796131e+08
3362 0.810001 6.438441e+08
2020-02-29 3185 0.690875 4.513044e+08
3304 0.311436 4.561929e+07
2020-03-31 2953 0.325846 7.770111e+08
2981 0.918461 7.594753e+08
3034 0.133053 6.767501e+08
3355 0.624519 6.318104e+07
A B
as_at_date entity_id
2020-01-31 2926 7 0
2943 0 3
2973 3 1
3104 1 5
3253 5 9
3362 9 7
2020-02-29 3185 9 9
3304 0 0
2020-03-31 2953 3 9
2981 9 6
3034 0 3
3355 6 0