Search code examples
Tags: pandas, group-by, isin

Isin across 2 columns for groupby


How can I use `isin` with an "or" condition when I know that the data to match in df1 is spread across two columns (Title, ID)?

The code below works if you delete the ` or df1[df1.ID.isin(df2[column])]` part:


# Question snippet: for every column of df2, select the rows of df1 whose
# Title or ID appears in that column, then count them per 'Whole' category.
import pandas as pd
# Rows to search; a value to match may sit in either 'Title' or 'ID'.
df1 = pd.DataFrame({'Title': ['A1', 'A2', 'A3', 'C1', 'C2', 'C3'], 
                    'ID': ['B1', 'B2', 'B3', 'D1', 'D2', 'D3'], 
                    'Whole': ['full', 'full', 'full', 'semi', 'semi', 'semi']})

# Each column is one group of values to look up in df1.
df2 = pd.DataFrame({'Group1': ['A1', 'A2', 'A3'], 
                    'Group2': ['B1', 'B2', 'B3']})

# Accumulates one entry per df2 column.
df = pd.DataFrame()

for column in df2.columns:
    
    # NOTE: this is the line the question is about. Python's `or` is applied
    # to two already-filtered DataFrames; the snippet only runs when the
    # `or df1[...]` part is removed.
    d_group = (df1[df1.Title.isin(df2[column])] or df1[df1.ID.isin(df2[column])])
     
    # Count rows per 'Whole' value, name the result after the df2 column, and
    # reindex to a fixed category list, filling absent categories with '-'.
    df3 = d_group.groupby('Whole')['Whole'].count()\
                .rename(column, inplace=True)\
                .reindex(['part', 'full', 'semi'], fill_value='-')
    # Add this group's counts to the accumulated result.
    df = df.append(df3, ignore_index=False, sort=False)
        
print(df)

Desired output:

            | full    | part     | semi
    --------+---------+----------+----------
    Group1  | 3       | -        | -
    Group2  | 3       | -        | -

Solution

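  • You need to use `|` (element-wise OR) instead of `or`, and make sure you use `[]` correctly to sub-select from the DataFrame you want: combine the two boolean masks first, then index the DataFrame once with the result. In general the notation is `df[selection_filter]`.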

    import pandas as pd

    # Rows to search; a value to match may sit in either 'Title' or 'ID'.
    df1 = pd.DataFrame({'Title': ['A1', 'A2', 'A3', 'C1', 'C2', 'C3'],
                        'ID': ['B1', 'B2', 'B3', 'D1', 'D2', 'D3'],
                        'Whole': ['full', 'full', 'full', 'semi', 'semi', 'semi']})

    # Each column is one group of values to look up in df1.
    df2 = pd.DataFrame({'Group1': ['A1', 'A2', 'A3'],
                        'Group2': ['B1', 'B2', 'B3']})

    # Categories of 'Whole' to report for every group, in display order.
    categories = ['part', 'full', 'semi']

    rows = []
    for column in df2.columns:
        wanted = df2[column]

        # `|` combines the two boolean masks element-wise. Python's `or` would
        # instead ask for the truth value of a whole pandas object and raise.
        mask = df1.Title.isin(wanted) | df1.ID.isin(wanted)
        d_group = df1[mask]

        # Count rows per category, label the result with the group name, and
        # show '-' for categories that have no matching rows.
        df3 = (
            d_group.groupby('Whole')['Whole'].count()
            .rename(column)
            .reindex(categories, fill_value='-')
        )
        rows.append(df3)

    # DataFrame.append was removed in pandas 2.0. Build the frame once from the
    # collected Series instead: one row per group, named after each Series.
    df = pd.DataFrame(rows)

    print(df)