Comparing dataframes with 2 different number of columns

I need to compare 2 dataframes with different columns and find differences between them.

While this example is numeric, my data is not, I am using this as a sample dataset.

import pandas as pd

# Sample DataFrames
ProdDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project D'],
                'Line identifier': [1, 2, 3, 4],
                'Column1': [10, 20, 30, 40],
                'Column2': [100, 200, 300, 400]})

TestDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project E'],
                'Line identifier': [1, 2, 3, 5],
                'Column1': [10, 20, 35, 45],
                'Column2': [100, 200, 350, 450],
                'Column3': [100, 200, 350, 450]})

key_columns = ['Project.name', 'Line identifier']

# Merge DataFrames on key columns
merged_df = pd.merge(ProdDF, TestDF, on=key_columns, how='outer', suffixes=('_Prod', 
    '_Test'))

print(merged_df)

output = pd.DataFrame(columns=['Row', 'Column', 'Misaligned Data'])

for index, row in merged_df.iterrows():
    for column in merged_df.columns:
        if column not in key_columns:
            prod_column = column + '_Prod'
            test_column = column + '_Test'

            if prod_column in merged_df.columns and test_column in merged_df.columns:
                if row[prod_column] != row[test_column]:
                    output = output.append({'Row': index, 'Column': column, 'Misaligned Data': (row[prod_column], row[test_column])}, ignore_index=True)

print(output)

This gives me the following output:

Project.name  Line identifier  Column1_Prod  Column2_Prod  Column1_Test  Column2_Test  
Column3
0    Project A                1          10.0         100.0          10.0         100.0    
100.0
1    Project B                2          20.0         200.0          20.0         200.0    
200.0
2    Project C                3          30.0         300.0          35.0         350.0    
350.0

I would like the output to be:

Project.Name, Line identifier Prod Test
Project C       3              30   35
Project A       1              100   NaN
Project A       2              200   NaN
Project A       3              250   NaN

I am only looking for the differences between dataframes.

Solution

You can change your loop to try this instead, comparing columns from the Prod dataframe in order to find the differences with the merged_df

formatted_output = []

for index, row in merged_df.iterrows():
    # compare columns from ProdDF
    for column in ProdDF.columns.difference(key_columns):
        prod_column = column + '_Prod'
        test_column = column + '_Test'

        # Check if both columns are in the merged DataFrame
        if prod_column in merged_df.columns or test_column in merged_df.columns:
            prod_value = row[prod_column] if prod_column in merged_df.columns else None
            test_value = row[test_column] if test_column in merged_df.columns else None

            # compare to find differences
            if prod_value != test_value:
                formatted_output.append({
                    'Project.Name': row['Project.name'],
                    'Line identifier': row['Line identifier'],
                    'Prod': prod_value,
                    'Test': test_value
                })

output_df = pd.DataFrame(formatted_output)

# sort by 'Project.Name' and 'Line identifier'
output_df = output_df.sort_values(by=['Project.Name', 'Line identifier'])

# print DataFrame with formatted output
print(output_df)