Search code examples
pandasgroup-bypercentage

Within group by percentage calculation in pandas Dataframe


    import pandas as pd

#create DataFrame
df = pd.DataFrame({'team': ['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
                   'status':['yes','yes','no','no','yes','no','no','yes','no','yes'],
                   'month':['m1','m2','m3','m2','m1','m3','m2','m1','m3','m1'],
                   'points': [12, 29, 34, 14, 10, 11, 7, 36, 34, 22]})

#view DataFrame
print(df)

Tried with below code

    x = df.groupby(['team','month','status']).size().reset_index().rename(columns={0:'cnt'}).pivot(index='team',columns=['month','status'])
x = x.fillna(0)
x

Sample output

Expected Output is within groupby % calculation

for every month m1 or m2 or m3 consider calculate yes% and no% . for example

for m1 and team A yes% 100% and no% should 0% m2 and team A yes% 100% and no% should 0% m3 and team A yes% 50% and no% should be 50% similarly for team B


Solution

  • You can use crosstab with normalize - but output is different:

    x = pd.crosstab([df['team'], df['month']], df['status'], normalize='index').mul(100)
    
    print (x)
    status         no    yes
    team month              
    A    m1       0.0  100.0
         m2      50.0   50.0
         m3     100.0    0.0
    B    m1       0.0  100.0
         m2     100.0    0.0
         m3     100.0    0.0
    

    If need new columns for percentages:

    x = pd.crosstab([df['team'], df['month']], df['status'])
    
    x = pd.concat([x, x.div(x.sum(axis=1), axis=0).add_suffix('_%').mul(100)], axis=1)
    print (x)
    status      no  yes   no_%  yes_%
    team month                       
    A    m1      0    2    0.0  100.0
         m2      1    1   50.0   50.0
         m3      1    0  100.0    0.0
    B    m1      0    2    0.0  100.0
         m2      1    0  100.0    0.0
         m3      2    0  100.0    0.0
    

    Another idea with SeriesGroupBy.value_counts:

    x = (df.groupby(['team','month'])['status'].value_counts(normalize=True)
           .unstack('status', fill_value=0)
           .mul(100))
    print (x)
    status         no    yes
    team month              
    A    m1       0.0  100.0
         m2      50.0   50.0
         m3     100.0    0.0
    B    m1       0.0  100.0
         m2     100.0    0.0
         m3     100.0    0.0
    

    Your ouput:

    g = df.groupby(['team','month'])['status']
    x = (pd.concat([g.value_counts(normalize=True).rename('%').mul(100),
                    g.value_counts().rename('no')], axis=1)
            .unstack(['month','status'], fill_value=0)
           )
    print (x)
                %                      no          
    month      m1     m2           m3  m1 m2     m3
    status    yes     no   yes     no yes no yes no
    team                                           
    A       100.0   50.0  50.0  100.0   2  1   1  1
    B       100.0  100.0   0.0  100.0   2  1   0  2
    

    g = df.groupby(['team','month'])['status']
    x = (pd.concat([g.value_counts(normalize=True).rename('%').mul(100),
                    g.value_counts().rename('no')], axis=1)
           .unstack(fill_value=0)
           .stack()
           .unstack(['month','status']))
    print (x)
              %                                 no                  
    month    m1            m2           m3      m1     m2     m3    
    status   no    yes     no   yes     no  yes no yes no yes no yes
    team                                                            
    A       0.0  100.0   50.0  50.0  100.0  0.0  0   2  1   1  1   0
    B       0.0  100.0  100.0   0.0  100.0  0.0  0   2  1   0  2   0
    

    x = (pd.crosstab([df['team'], df['month']], df['status'], normalize='index')
           .stack()
           .mul(100)
           .unstack(['month','status']))
    
    print (x)
    month    m1            m2           m3     
    status   no    yes     no   yes     no  yes
    team                                       
    A       0.0  100.0   50.0  50.0  100.0  0.0
    B       0.0  100.0  100.0   0.0  100.0  0.0