import pandas as pd
d={'Country':['Algeria', 'France', 'Italy']*10, 'Input category':[1,2]*15, 'Output category':[0,0,0]*10}
df=pd.DataFrame(d)
df.sort_values(['Country', 'Input category']).reset_index(drop=True)
Country Input category Output category
0 Algeria 1 0
1 Algeria 1 0
2 Algeria 1 0
3 Algeria 1 0
4 Algeria 1 0
5 Algeria 2 0
6 Algeria 2 0
7 Algeria 2 0
8 Algeria 2 0
9 Algeria 2 0
10 France 1 0
11 France 1 0
12 France 1 0
13 France 1 0
14 France 1 0
15 France 2 0
16 France 2 0
17 France 2 0
18 France 2 0
19 France 2 0
20 Italy 1 0
21 Italy 1 0
22 Italy 1 0
23 Italy 1 0
24 Italy 1 0
25 Italy 2 0
26 Italy 2 0
27 Italy 2 0
28 Italy 2 0
29 Italy 2 0
I have a dataset that contains a number of rows that are individuals from a country bearing an input category (1,2). Each unique row is present 5 times (5 times the same row, then 5 times the next row etc.). What I want to do is to create a new column in my df (let's say output) and to assign it another value (also 1 or 2) based on a conditional distribution.
d={'Country': ['Algeria', 'France', 'Italy'] , 'p1_1':[2/5,1/5,1/5], 'p2_1':[3/5,4/5,4/5], 'p1_2':[2/5,3/5,5/5], 'p2_2':[3/5,2/5,0]}
cond_prob=pd.DataFrame(d)
cond_prob
Country p1_1 p2_1 p1_2 p2_2
0 Algeria 0.4 0.6 0.4 0.6
1 France 0.2 0.8 0.6 0.4
2 Italy 0.2 0.8 1.0 0.0
For instance, since for Algeria p1_1 (P of Output= 1 with input=1) = 2/5, I want to assign the output 1 to 2 of my rows (thus the output 2 to the 3 remaining row).
Edited: here is the expected output :
Country Input category Output category
0 Algeria 1 1
1 Algeria 1 1
2 Algeria 1 1
3 Algeria 1 2
4 Algeria 1 2
5 Algeria 2 1
6 Algeria 2 1
7 Algeria 2 1
8 Algeria 2 2
9 Algeria 2 2
10 France 1 1
11 France 1 2
12 France 1 2
13 France 1 2
14 France 1 2
15 France 2 1
16 France 2 1
17 France 2 1
18 France 2 2
19 France 2 2
20 Italy 1 1
21 Italy 1 2
22 Italy 1 2
23 Italy 1 2
24 Italy 1 2
25 Italy 2 1
26 Italy 2 1
27 Italy 2 1
28 Italy 2 1
29 Italy 2 1
IIUC,
n=5
#
#s = df['Country'].value_counts()
#assert s.nunique() == 1
#n = s.iloc[0] // df['Input category'].nunique()
#print(n)
##5
df = df.sort_values(['Country', 'Input category']).reset_index(drop=True)
df2 = cond_prob.melt('Country').sort_values(['Country'])
df['Output Category'] = (df2.reindex(df2.index.repeat(df2['value'].mul(n)))['variable']
.str.extract('(\d+)')[0].values.astype(int))
print(df)
Country Input category Output category
0 Algeria 1 1
1 Algeria 1 1
2 Algeria 1 2
3 Algeria 1 2
4 Algeria 1 2
5 Algeria 2 1
6 Algeria 2 1
7 Algeria 2 2
8 Algeria 2 2
9 Algeria 2 2
10 France 1 1
11 France 1 2
12 France 1 2
13 France 1 2
14 France 1 2
15 France 2 1
16 France 2 1
17 France 2 1
18 France 2 2
19 France 2 2
20 Italy 1 1
21 Italy 1 2
22 Italy 1 2
23 Italy 1 2
24 Italy 1 2
25 Italy 2 1
26 Italy 2 1
27 Italy 2 1
28 Italy 2 1
29 Italy 2 1
If you need sort by Input category
:
df2 = cond_prob.melt('Country').sort_values('Country')
df2 = df2.reindex(df2.index.repeat(df2['value'].mul(5)))
values = (df2.assign(**df2['variable'].str.split('_', expand=True)
.set_axis(['Output category', 'Input category'],
axis=1))
.sort_values(['Country', 'Input category']))['Output category'].str.extract('(\d+)').values
df['Output category'] = values
print(df)