python pandas dataframe probability-distribution

How can I assign a value picked from a list to a df column based on a probability distribution?

import pandas as pd
d={'Country':['Algeria', 'France', 'Italy']*10, 'Input category':[1,2]*15, 'Output category':[0,0,0]*10}
df=pd.DataFrame(d)
df.sort_values(['Country', 'Input category']).reset_index(drop=True)

    Country  Input category  Output category
0   Algeria               1                0
1   Algeria               1                0
2   Algeria               1                0
3   Algeria               1                0
4   Algeria               1                0
5   Algeria               2                0
6   Algeria               2                0
7   Algeria               2                0
8   Algeria               2                0
9   Algeria               2                0
10   France               1                0
11   France               1                0
12   France               1                0
13   France               1                0
14   France               1                0
15   France               2                0
16   France               2                0
17   France               2                0
18   France               2                0
19   France               2                0
20    Italy               1                0
21    Italy               1                0
22    Italy               1                0
23    Italy               1                0
24    Italy               1                0
25    Italy               2                0
26    Italy               2                0
27    Italy               2                0
28    Italy               2                0
29    Italy               2                0

I have a dataset that contains a number of rows that are individuals from a country bearing an input category (1,2). Each unique row is present 5 times (5 times the same row, then 5 times the next row etc.). What I want to do is to create a new column in my df (let's say output) and to assign it another value (also 1 or 2) based on a conditional distribution.

d={'Country': ['Algeria', 'France', 'Italy'] , 'p1_1':[2/5,1/5,1/5], 'p2_1':[3/5,4/5,4/5], 'p1_2':[2/5,3/5,5/5], 'p2_2':[3/5,2/5,0]}
cond_prob=pd.DataFrame(d)
cond_prob

   Country  p1_1  p2_1  p1_2  p2_2
0  Algeria   0.4   0.6   0.4   0.6
1   France   0.2   0.8   0.6   0.4
2    Italy   0.2   0.8   1.0   0.0

For instance, since for Algeria p1_1 (P of Output= 1 with input=1) = 2/5, I want to assign the output 1 to 2 of my rows (thus the output 2 to the 3 remaining row).

Edited: here is the expected output :


Country  Input category  Output category
0   Algeria               1                1
1   Algeria               1                1
2   Algeria               1                1
3   Algeria               1                2
4   Algeria               1                2
5   Algeria               2                1
6   Algeria               2                1
7   Algeria               2                1
8   Algeria               2                2
9   Algeria               2                2
10   France               1                1
11   France               1                2
12   France               1                2
13   France               1                2
14   France               1                2
15   France               2                1
16   France               2                1
17   France               2                1
18   France               2                2
19   France               2                2
20    Italy               1                1
21    Italy               1                2
22    Italy               1                2
23    Italy               1                2
24    Italy               1                2
25    Italy               2                1
26    Italy               2                1
27    Italy               2                1
28    Italy               2                1
29    Italy               2                1

Solution

IIUC,

n=5
#
#s = df['Country'].value_counts()
#assert s.nunique() == 1
#n = s.iloc[0] // df['Input category'].nunique()
#print(n)
##5

df = df.sort_values(['Country', 'Input category']).reset_index(drop=True)
df2 = cond_prob.melt('Country').sort_values(['Country'])
df['Output Category'] = (df2.reindex(df2.index.repeat(df2['value'].mul(n)))['variable']
                            .str.extract('(\d+)')[0].values.astype(int))

print(df)
    Country  Input category  Output category
0   Algeria               1                1
1   Algeria               1                1
2   Algeria               1                2
3   Algeria               1                2
4   Algeria               1                2
5   Algeria               2                1
6   Algeria               2                1
7   Algeria               2                2
8   Algeria               2                2
9   Algeria               2                2
10   France               1                1
11   France               1                2
12   France               1                2
13   France               1                2
14   France               1                2
15   France               2                1
16   France               2                1
17   France               2                1
18   France               2                2
19   France               2                2
20    Italy               1                1
21    Italy               1                2
22    Italy               1                2
23    Italy               1                2
24    Italy               1                2
25    Italy               2                1
26    Italy               2                1
27    Italy               2                1
28    Italy               2                1
29    Italy               2                1

If you need sort by Input category:

df2 = cond_prob.melt('Country').sort_values('Country')
df2 = df2.reindex(df2.index.repeat(df2['value'].mul(5)))
values = (df2.assign(**df2['variable'].str.split('_', expand=True)
                                      .set_axis(['Output category', 'Input category'],
                                                axis=1))
             .sort_values(['Country', 'Input category']))['Output category'].str.extract('(\d+)').values
df['Output category'] = values
print(df)