Search code examples
pythonpandasdataframe

Comparing dataframes with 2 different number of columns


I need to compare 2 dataframes with different columns and find differences between them.

While this example is numeric, my data is not, I am using this as a sample dataset.

import pandas as pd

# Sample DataFrames
ProdDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project D'],
                'Line identifier': [1, 2, 3, 4],
                'Column1': [10, 20, 30, 40],
                'Column2': [100, 200, 300, 400]})

TestDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project E'],
                'Line identifier': [1, 2, 3, 5],
                'Column1': [10, 20, 35, 45],
                'Column2': [100, 200, 350, 450],
                'Column3': [100, 200, 350, 450]})
key_columns = ['Project.name', 'Line identifier']

# Merge DataFrames on key columns
merged_df = pd.merge(ProdDF, TestDF, on=key_columns, how='outer', suffixes=('_Prod', 
    '_Test'))

print(merged_df)

output = pd.DataFrame(columns=['Row', 'Column', 'Misaligned Data'])

for index, row in merged_df.iterrows():
    for column in merged_df.columns:
        if column not in key_columns:
            prod_column = column + '_Prod'
            test_column = column + '_Test'

            if prod_column in merged_df.columns and test_column in merged_df.columns:
                if row[prod_column] != row[test_column]:
                    output = output.append({'Row': index, 'Column': column, 'Misaligned Data': (row[prod_column], row[test_column])}, ignore_index=True)

print(output)

This gives me the following output:

Project.name  Line identifier  Column1_Prod  Column2_Prod  Column1_Test  Column2_Test  
Column3
0    Project A                1          10.0         100.0          10.0         100.0    
100.0
1    Project B                2          20.0         200.0          20.0         200.0    
200.0
2    Project C                3          30.0         300.0          35.0         350.0    
350.0

I would like the output to be:

Project.Name, Line identifier Prod Test
Project C       3              30   35
Project A       1              100   NaN
Project A       2              200   NaN
Project A       3              250   NaN

I am only looking for the differences between dataframes.


Solution

  • You can change your loop to try this instead, comparing columns from the Prod dataframe in order to find the differences with the merged_df

    formatted_output = []
    
    for index, row in merged_df.iterrows():
        # compare columns from ProdDF
        for column in ProdDF.columns.difference(key_columns):
            prod_column = column + '_Prod'
            test_column = column + '_Test'
    
            # Check if both columns are in the merged DataFrame
            if prod_column in merged_df.columns or test_column in merged_df.columns:
                prod_value = row[prod_column] if prod_column in merged_df.columns else None
                test_value = row[test_column] if test_column in merged_df.columns else None
    
                # compare to find differences
                if prod_value != test_value:
                    formatted_output.append({
                        'Project.Name': row['Project.name'],
                        'Line identifier': row['Line identifier'],
                        'Prod': prod_value,
                        'Test': test_value
                    })
    
    output_df = pd.DataFrame(formatted_output)
    
    # sort by 'Project.Name' and 'Line identifier'
    output_df = output_df.sort_values(by=['Project.Name', 'Line identifier'])
    
    # print DataFrame with formatted output
    print(output_df)