Search code examples
pythondataframecomparisonmissing-data

Compare two dataframes for missing rows based on multiple columns python


I want to compare two dataframes that have similar columns(not all) and print a new dataframe that shows the missing rows of df1 compare to df2 and a second dataframe that shows this time the missing values of df2 compare to df1 based on given columns.

Here the "key_columns" are named key_column1 and key_column2

import pandas as pd


data1 = {'first_column':  ['4', '2', '7', '2', '2'],
        'second_column': ['1', '2', '2', '2', '2'],
       'key_column1':['1', '3', '2', '6', '4'],
      'key_column2':['1', '2', '2', '1', '1'],
       'fourth_column':['1', '2', '2', '2', '2'],
         'other':['1', '2', '3', '2', '2'],
        }
df1 = pd.DataFrame(data1)

data2 = {'first':  ['1', '2', '2', '2', '2'],
        'second_column': ['1', '2', '2', '2', '2'],
       'key_column1':['1', '3', '2', '6', '4'],
      'key_column2':['1', '5', '2', '2', '2'],
       'fourth_column':['1', '2', '2', '2', '2'],
          'other2':['1', '4', '3', '2', '2'],
         'other3':['6', '8', '1', '4', '2'],
        }

df2 = pd.DataFrame(data2)

enter image description here


Solution

  • I have modified the data1 and data2 dictionaries so that the resulting dataframes have only same columns to demonstrate that the solution provided in the answer by Emi OB relies on existence of columns in one dataframe which are not in the other one ( in case a common column is used the code fails with KeyError on the column chosen to collect NaNs). Below an improved version which does not suffer from that limitation creating own columns for the purpose of collecting NaNs:

    df1['df1_NaNs'] = '' # create additional column to collect NaNs
    df2['df2_NaNs'] = '' # create additional column to collect NaNs
    
    df1_s = df1.merge(df2[['key_column1', 'key_column2', 'df2_NaNs']], on=['key_column1', 'key_column2'], how='outer')
    df2 = df2.drop(columns=["df2_NaNs"])   # clean up df2 
    df1_s = df1_s.loc[df1_s['df2_NaNs'].isna(), df1.columns]
    df1_s = df1_s.drop(columns=["df1_NaNs"]) # clean up df1_s 
    print(df1_s)
    
    print('--------------------------------------------')
    
    df2_s = df2.merge(df1[['key_column1', 'key_column2', 'df1_NaNs']], on=['key_column1', 'key_column2'], how='outer')
    df1 = df1.drop(columns=["df1_NaNs"])   # clean up df1 
    df2_s = df2_s.loc[df2_s['df1_NaNs'].isna(), df2.columns]
    df2_s = df2_s.drop(columns=["df2_NaNs"]) # clean up df2_s 
    print(df2_s)
    

    gives:

      first second_column key_column1 key_column2 fourth_column
    1     2             2           3           2             2
    3     2             2           6           1             2
    4     2             2           4           1             2
    --------------------------------------------
      first second_column key_column1 key_column2 fourth_column
    1     2             2           3           5             3
    3     2             2           6           2             5
    4     2             2           4           2             6
    

    Also the code below works in case the columns of both dataframes are the same and in addition saves memory and computation time by not creating temporary full-sized dataframes required to achieve the final result:

    """ I want to compare two dataframes that have similar columns(not all) 
    and print a new dataframe that shows the missing rows of df1 compare to 
    df2 and a second dataframe that shows this time the missing values of 
    df2 compare to df1 based on given columns. Here the "key_columns"
    """
    import pandas as pd
    #data1 ={  'first_column':['4', '2', '7', '2', '2'],
    data1 = {         'first':['4', '2', '7', '2', '2'],
              'second_column':['1', '2', '2', '2', '2'],
                'key_column1':['1', '3', '2', '6', '4'],
                'key_column2':['1', '2', '2', '1', '1'],
              'fourth_column':['1', '2', '2', '2', '2'],
    #                 'other':['1', '2', '3', '2', '2'],
            }
    df1 = pd.DataFrame(data1)
    #print(df1)
    
    data2 = {  'first':['1', '2', '2', '2', '2'],
       'second_column':['1', '2', '2', '2', '2'],
         'key_column1':['1', '3', '2', '6', '4'],
         'key_column2':['1', '5', '2', '2', '2'],
    #  'fourth_column':['1', '2', '2', '2', '2'],
       'fourth_column':['2', '3', '4', '5', '6'],
    #         'other2':['1', '4', '3', '2', '2'],
    #         'other3':['6', '8', '1', '4', '2'],
            }
    df2 = pd.DataFrame(data2)
    #print(df2)
    
    data1_key_cols = dict.fromkeys( zip(data1['key_column1'], data1['key_column2']) )
    data2_key_cols = dict.fromkeys( zip(data2['key_column1'], data2['key_column2']) )
    # for Python versions < 3.7 (dictionaries are not ordered):
    #data1_key_cols = list(zip(data1['key_column1'], data1['key_column2']))
    #data2_key_cols = list(zip(data2['key_column1'], data2['key_column2']))
    from collections import defaultdict
    missing_data2_in_data1 = defaultdict(list)
    missing_data1_in_data2 = defaultdict(list)
    
    for indx, val in enumerate(data1_key_cols.keys()):
    #for indx, val in enumerate(data1_key_cols): # for Python version < 3.7
        if val not in data2_key_cols:
            for key, val in data1.items():
                missing_data1_in_data2[key].append(data1[key][indx])
    for indx, val in enumerate(data2_key_cols.keys()):
    #for indx, val in enumerate(data2_key_cols): # for Python version < 3.7
        if val not in data1_key_cols:
            for key, val in data2.items():
                missing_data2_in_data1[key].append(data2[key][indx])
    df1_s = pd.DataFrame(missing_data1_in_data2)
    df2_s = pd.DataFrame(missing_data2_in_data1)
    print(df1_s)
    print('--------------------------------------------')
    print(df2_s)
    

    prints

      first second_column key_column1 key_column2 fourth_column
    0     2             2           3           2             2
    1     2             2           6           1             2
    2     2             2           4           1             2
    --------------------------------------------
      first second_column key_column1 key_column2 fourth_column
    0     2             2           3           5             3
    1     2             2           6           2             5
    2     2             2           4           2             6