I have pandas DataFrame of numeric columns. This data does not contain any zeros(0).
I have defined a function that creates integer bins as follows
1. -1 for all negative values
2. Bin width (e.g. 1000000) upto threshold (e.g. 20000000)
3. After the threshold (e.g. 20000000), it will dynamically size bins to contain 1% of the total count or NOBS(No. of Observations)
i.e. if NOBS is going below 1% of total NOBS then it will dynamically adjust the bin width such that it will be approx 1% of total NOBS.
Here is my code
import pandas as pd
import numpy as np
# Reproducible example
rng = np.random.default_rng(42)
a = rng.integers(-1687500000, 0, 5000)
b = rng.integers(0, 20000000, 25000)
c = rng.integers(20000000, 56956420100000, 4149)
df = pd.DataFrame({'Col1': np.concatenate([a, b, c])})
def dynamic_binning(df_, col_, first_bin, first_bin_label, bin_width, threshold):
# Create bins
m = df_[col_] > threshold
q = np.linspace(0, 1, 100 * sum(m) // len(df_))
bins = [-np.inf] + np.arange(first_bin, threshold+1, bin_width).tolist()
bins += df_.loc[m, col_].quantile(q).tolist()[1:]
# df_.loc[df_[col_]<=first_bin] = first_bin
df_bins_ = (pd.cut(df_[col_].dropna(), bins, labels=False, right=False).sub(abs(first_bin_label)+1)
.value_counts().rename('NOBS')
.sort_index().rename_axis('Bins').reset_index()
.assign(Percentage=lambda x: x['NOBS'] / len(df_) * 108))
return df_bins_
Calling the function
dynamic_binning(df, 'Col1', 0, 0, 1000000, 20000000)
The output of the function
Bins NOBS Percentage
0 -1.0 5000 15.813054555038216
1 0.0 1274 4.0291663006237375
2 1.0 1169 3.6970921549679345
3 2.0 1287 4.070280242466836
4 3.0 1241 3.924800140560485
5 4.0 1296 4.098743740665905
6 5.0 1246 3.940613195115523
7 6.0 1288 4.073442853377844
8 7.0 1201 3.798295704120179
9 8.0 1289 4.076605464288852
10 9.0 1174 3.712905209522973
11 10.0 1273 4.02600368971273
12 11.0 1246 3.940613195115523
13 12.0 1277 4.0386541333567605
14 13.0 1292 4.086093297021875
15 14.0 1259 3.9817271369586225
16 15.0 1276 4.035491522445753
17 16.0 1200 3.7951330932091714
18 17.0 1242 3.9279627514714925
19 18.0 1204 3.807783536853202
20 19.0 1266 4.003865413335676
21 20.0 378 1.195466924360889
22 21.0 377 1.1923043134498814
23 22.0 377 1.1923043134498814
24 23.0 377 1.1923043134498814
25 24.0 377 1.1923043134498814
26 25.0 377 1.1923043134498814
27 26.0 377 1.1923043134498814
28 27.0 377 1.1923043134498814
29 28.0 377 1.1923043134498814
30 29.0 377 1.1923043134498814
31 30.0 377 1.1923043134498814
In the above example, 20 Bins -1, 0, 1, 2,3,......, 19 are till threshold i.e. 20000000. Bins 20, 21, 22,...30 are dynamic bins as per the third condition mentioned above.
I'm facing issues while using this function for the following cases
first_bin = -10000000, bin_width = 500000, threshold = 5000000
Calling dynamic_binning function
dynamic_binning(df, 'Col2', -10000000, -20, 500000, 10000000)
In this case, 20 Bins -20, -19, -18,.....-1, 0, 1, 2, .....9
are till threshold i.e. 10000000
. Bins 10, 11, 12,...
are dynamic bins as per the third condition mentioned above. But I need to use this code df_.loc[df_[col_]<=first_bin] = first_bin
before df_bins_ = (pd.cut(....
otherwise it creates two bins at the start i.e. -21 and -20 instead of -20 which includes values <=10000000
first_bin = 0, bin_width = 0.1, threshold = 3
Calling dynamic_binning function
dynamic_binning(df, 'Col3', 0, 0, 0.1, 3)
In this case, df['Col3']
values range from -0.281 to 1.922I get the following error
ValueError: bins must increase monotonically
Maybe you can try this version:
def dynamic_binning(df_, col_, first_bin, first_bin_label, bin_width, threshold):
# Create bins
m = df_[col_] > threshold
q = np.linspace(0, 1, 100 * sum(m) // len(df_))
bins = [-np.inf] + np.arange(first_bin, threshold+1, bin_width).tolist()
bins += df_.loc[m, col_].quantile(q).tolist()[1:]
df_bins_ = (pd.cut(df_[col_].dropna(), bins, labels=False, right=False)
.value_counts().rename('NOBS'))
# Fix missing bins
idx = pd.RangeIndex(df_bins_.index.min(), df_bins_.index.max(), name='Bins')
df_bins_ = (df_bins_.reindex(idx, fill_value=0).reset_index()
.assign(Percentage=lambda x: x['NOBS'] / len(df_) * 108,
Bins=lambda x: x['Bins'].astype(int) + first_bin_label))
return df_bins_
However, your first call should be dynamic_binning(df, 'Col1', 0, -1, 1000000, 20000000)
to be consistent.
>>> dynamic_binning(df, 'Col1', 0, 0, 1000000, 20000000)
Bins NOBS Percentage
0 0 5000 15.813055
1 1 1274 4.029166
2 2 1169 3.697092
3 3 1287 4.070280
4 4 1241 3.924800
5 5 1296 4.098744
6 6 1246 3.940613
7 7 1288 4.073443
8 8 1201 3.798296
9 9 1289 4.076605
10 10 1174 3.712905
11 11 1273 4.026004
12 12 1246 3.940613
13 13 1277 4.038654
14 14 1292 4.086093
15 15 1259 3.981727
16 16 1276 4.035492
17 17 1200 3.795133
18 18 1242 3.927963
19 19 1204 3.807784
20 20 1266 4.003865
21 21 378 1.195467
22 22 377 1.192304
23 23 377 1.192304
24 24 377 1.192304
25 25 377 1.192304
26 26 377 1.192304
27 27 377 1.192304
28 28 377 1.192304
29 29 377 1.192304
30 30 377 1.192304
>>> dynamic_binning(df, 'Col1', -10000000, -20, 500000, 10000000)
Bins NOBS Percentage
0 -20 4969 15.715014
1 -19 2 0.006325
2 -18 1 0.003163
3 -17 2 0.006325
4 -16 2 0.006325
.. ... ... ...
82 62 355 1.122727
83 63 355 1.122727
84 64 355 1.122727
85 65 355 1.122727
86 66 355 1.122727
[87 rows x 3 columns]
>>> dynamic_binning(df, 'Col1', 0, 0, 0.1, 3)
Bins NOBS Percentage
0 0 5000 15.813055
1 1 0 0.000000
2 2 0 0.000000
3 3 0 0.000000
4 4 0 0.000000
.. ... ... ...
118 118 347 1.097426
119 119 347 1.097426
120 120 347 1.097426
121 121 347 1.097426
122 122 347 1.097426
[123 rows x 3 columns]