I need to compare 2 dataframes with different columns and find differences between them.
While this example is numeric, my data is not, I am using this as a sample dataset.
import pandas as pd
# Sample DataFrames
ProdDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project D'],
'Line identifier': [1, 2, 3, 4],
'Column1': [10, 20, 30, 40],
'Column2': [100, 200, 300, 400]})
TestDF = pd.DataFrame({'Project.name': ['Project A', 'Project B', 'Project C', 'Project E'],
'Line identifier': [1, 2, 3, 5],
'Column1': [10, 20, 35, 45],
'Column2': [100, 200, 350, 450],
'Column3': [100, 200, 350, 450]})
key_columns = ['Project.name', 'Line identifier']
# Merge DataFrames on key columns
merged_df = pd.merge(ProdDF, TestDF, on=key_columns, how='outer', suffixes=('_Prod',
'_Test'))
print(merged_df)
output = pd.DataFrame(columns=['Row', 'Column', 'Misaligned Data'])
for index, row in merged_df.iterrows():
for column in merged_df.columns:
if column not in key_columns:
prod_column = column + '_Prod'
test_column = column + '_Test'
if prod_column in merged_df.columns and test_column in merged_df.columns:
if row[prod_column] != row[test_column]:
output = output.append({'Row': index, 'Column': column, 'Misaligned Data': (row[prod_column], row[test_column])}, ignore_index=True)
print(output)
This gives me the following output:
Project.name Line identifier Column1_Prod Column2_Prod Column1_Test Column2_Test
Column3
0 Project A 1 10.0 100.0 10.0 100.0
100.0
1 Project B 2 20.0 200.0 20.0 200.0
200.0
2 Project C 3 30.0 300.0 35.0 350.0
350.0
I would like the output to be:
Project.Name, Line identifier Prod Test
Project C 3 30 35
Project A 1 100 NaN
Project A 2 200 NaN
Project A 3 250 NaN
I am only looking for the differences between dataframes.
You can change your loop to try this instead, comparing columns from the Prod dataframe in order to find the differences with the merged_df
formatted_output = []
for index, row in merged_df.iterrows():
# compare columns from ProdDF
for column in ProdDF.columns.difference(key_columns):
prod_column = column + '_Prod'
test_column = column + '_Test'
# Check if both columns are in the merged DataFrame
if prod_column in merged_df.columns or test_column in merged_df.columns:
prod_value = row[prod_column] if prod_column in merged_df.columns else None
test_value = row[test_column] if test_column in merged_df.columns else None
# compare to find differences
if prod_value != test_value:
formatted_output.append({
'Project.Name': row['Project.name'],
'Line identifier': row['Line identifier'],
'Prod': prod_value,
'Test': test_value
})
output_df = pd.DataFrame(formatted_output)
# sort by 'Project.Name' and 'Line identifier'
output_df = output_df.sort_values(by=['Project.Name', 'Line identifier'])
# print DataFrame with formatted output
print(output_df)