Search code examples
pandasmeanpandas-groupby

Python pandas how to get the reverse of groupby


I have two data frames. They are the same except for one column. I want to change the column of the second dataframe according to mean values from the first dataframe. For the latter I have to use groupby, but then I don't know how to get a reverse. Below is a minimal example, where in this particular example df_two should end up being the same as df_one. My question is how to get from tmp to df2_new - see the code below.

import pandas as pd


def foo(df1, df2):
    # Group by A
    groupsA_one = dict(list(df1.groupby('A', as_index=False)))
    groupsA_two = dict(list(df2.groupby('A', as_index=False)))

    for key_A in groupsA_one:
        # Group by B
        groupsB_one = dict(list(groupsA_one[key_A].groupby('B', as_index=False)))
        groupsB_two = dict(list(groupsA_two[key_A].groupby('B', as_index=False)))

        for key_B in groupsB_one:
            # Group by C
            tmp = groupsB_two[key_B].groupby('C', as_index=False)['D'].mean()   # Returns DataFrame with NaN
            tmp['D'] = groupsB_one[key_B].groupby('C', as_index=False)['D'].mean()['D']
            print tmp

    df2_new = []        # ???
    return df2_new

if __name__ == '__main__':
    A1 = {'A': [1, 1, 1, 1, 2, 2, 2, 2], 'B': [1, 1, 2, 2, 1, 1, 2, 2],
          'C': [1, 2, 1, 2, 1, 2, 1, 2], 'D': [5, 5, 5, 5, 5, 5, 5, 5]}
    A2 = {'A': [1, 1, 1, 1, 2, 2, 2, 2], 'B': [1, 1, 2, 2, 1, 1, 2, 2],
          'C': [1, 2, 1, 2, 1, 2, 1, 2], 'D': [0, 0, 0, 0, 0, 0, 0, 0]}
    df_one = pd.DataFrame(A1)
    df_two = pd.DataFrame(A2)
    foo(df_one, df_two)

Solution

  • Here is the solution that I wanted. Please, if you find a more elegant solution I will be happy to set it as a correct answer.

    Hre it is:

    import pandas as pd
    import numpy as np
    
    
    def foo(df):
        # Group by A
        groups_a_one = dict(list(df.groupby('A', as_index=False)))
    
        for key_a in groups_a_one:
            # Group by B
            groups_b_one = dict(list(groups_a_one[key_a].groupby('B', as_index=False)))
    
            for key_b in groups_b_one:
                # Group by C
                tmp = groups_b_one[key_b].groupby('C', as_index=False).transform(lambda x: x.fillna(x.mean()))
                df.ix[tmp.index, 'D'] = tmp['D']# assign mean values to correct lines in df
    
        return df
    
    if __name__ == '__main__':
        A1 = {'A': [1, 1, 1, 1, 2, 2, 2, 2], 'B': [1, 1, 2, 2, 1, 1, 2, 2],
              'C': [1, 2, 1, 2, 1, 2, 1, 2], 'D': [5, 5, 5, 5, 5, 5, 5, 5]}
        A2 = {'A': [1, 1, 1, 1, 2, 2, 2, 2], 'B': [1, 1, 2, 2, 1, 1, 2, 2],
              'C': [1, 2, 1, 2, 1, 2, 1, 2], 'D': [np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN]}
        df_one = pd.DataFrame(A1)
        df_two = pd.DataFrame(A2)
        df = pd.concat([df_one, df_two], axis=0, ignore_index=True)# To get only one DataFrame
    
        # run the transform
        foo(df)
    

    Here is the initial state and the final one:

    # Initial
        A  B  C  D
    0   1  1  1  5
    1   1  1  2  5
    2   1  2  1  5
    3   1  2  2  5
    4   2  1  1  5
    5   2  1  2  5
    6   2  2  1  5
    7   2  2  2  5
    8   1  1  1  NaN
    9   1  1  2  NaN
    10  1  2  1  NaN
    11  1  2  2  NaN
    12  2  1  1  NaN
    13  2  1  2  NaN
    14  2  2  1  NaN
    15  2  2  2  NaN
    
    # Final
        A  B  C  D
    0   1  1  1  5
    1   1  1  2  5
    2   1  2  1  5
    3   1  2  2  5
    4   2  1  1  5
    5   2  1  2  5
    6   2  2  1  5
    7   2  2  2  5
    8   1  1  1  5
    9   1  1  2  5
    10  1  2  1  5
    11  1  2  2  5
    12  2  1  1  5
    13  2  1  2  5
    14  2  2  1  5
    15  2  2  2  5