I am working on a data processing task in Python using Pandas, where I need to pair rows from a DataFrame based on specific criteria and then store these pairs for further processing. My challenge is to efficiently store these pairs in a dictionary, with each pair comprising two DataFrame rows (an 'a' entry and a 'b' entry). EDIT: Every 'a' entry is paired to one 'b' entry. Elements where no pair was foudn are not added to the dictionary.
Context:
Problem:
I want to use a dictionary to store these pairs for efficient retrieval and manipulation later. The ideal structure would be:
{ a_entry : b_entry }
However its not possible to do this, since rows are unhashable.
I prefer having it as a dictionary because that way I can merge these two rows very easily like this in a loop:
merged_entries = {}
for a_entry, b_entry in pairs_dict.items():
code = '19' if a_entry['Asleep'] and b_entry['Asleep'] else '11'
if code == '19':
logging.info(f"Changed code from 11 to 19 for seat {a_entry['Seat']}")
merged_entry = {
'Seat': a_entry['Seat'],
'Starttime': a_entry['Starttime'],
'Endtime': b_entry['Endtime'],
'Duration': b_entry['Endtime'] - a_entry['Starttime'],
'Code': code,
'Notes': None,
'Asleep': a_entry['Asleep'],
'Concert': a_entry['Concert'],
'Camera_Position': a_entry['Camera_Position'],
'Composer': a_entry['Composer'],
'Movement': a_entry['Movement'],
'File_Name': a_entry['File_Name']
}
merged_entries.append(merged_entry)
# Here's sample data
data = {
'Seat': [1, 1, 1, 2, 2, 3, 3, 3, 3],
'Starttime': [datetime(2023, 1, 1, 9, 0),
datetime(2023, 1, 1, 9, 30),
datetime(2023, 1, 1, 10, 0),
datetime(2023, 1, 1, 9, 15),
datetime(2023, 1, 1, 9, 45),
datetime(2023, 1, 1, 10, 10),
datetime(2023, 1, 1, 10, 30),
datetime(2023, 1, 1, 10, 50),
datetime(2023, 1, 1, 11, 10)],
'Endtime': [datetime(2023, 1, 1, 9, 20),
datetime(2023, 1, 1, 9, 50),
datetime(2023, 1, 1, 10, 20),
datetime(2023, 1, 1, 9, 35),
datetime(2023, 1, 1, 10, 5),
datetime(2023, 1, 1, 10, 20),
datetime(2023, 1, 1, 10, 50),
datetime(2023, 1, 1, 11, 0),
datetime(2023, 1, 1, 11, 20)],
'Type': ['a', 'b', 'b', 'a', 'b', 'a', 'a', 'b', 'b'],
'Asleep': [False, True, False, True, True, False, True, True, False],
'Code': [11, 11, 11, 11, 11, 11, 11, 11, 11],
'Notes': [None, None, None, None, None, None, None, None, None],
'Concert': ['Concert1', 'Concert1', 'Concert1', 'Concert2', 'Concert2', 'Concert3', 'Concert3', 'Concert3', 'Concert3'],
'Camera_Position': ['Front', 'Front', 'Front', 'Back', 'Back', 'Left', 'Left', 'Left', 'Left'],
'Composer': ['Bach', 'Bach', 'Bach', 'Mozart', 'Mozart', 'Beethoven', 'Beethoven', 'Beethoven', 'Beethoven'],
'Movement': ['Adagio', 'Adagio', 'Adagio', 'Allegro', 'Allegro', 'Fur Elise', 'Fur Elise', 'Fur Elise', 'Fur Elise'],
'File_Name': ['file1.csv', 'file1.csv', 'file1.csv', 'file2.csv', 'file2.csv', 'file3.csv', 'file3.csv', 'file3.csv', 'file3.csv']
}
I tried:
As you mentioned, rows cannot be dict keys, because pandas series are not hashable. To process the data the way need it, you can instead iterate over the rows and keep track of the rows that meet your conditions:
def merge_entries(df, a_i, b_i):
a_entry = df.iloc[a_i]
b_entry = df.iloc[a_i]
code = '19' if a_entry['Asleep'] and b_entry['Asleep'] else '11'
if code == '19':
logging.info(f"Changed code from 11 to 19 for seat {a_entry['Seat']}")
merged_entry = {
'Seat': a_entry['Seat'],
'Starttime': a_entry['Starttime'],
'Endtime': b_entry['Endtime'],
'Duration': b_entry['Endtime'] - a_entry['Starttime'],
'Code': code,
'Notes': None,
'Asleep': a_entry['Asleep'],
'Concert': a_entry['Concert'],
'Camera_Position': a_entry['Camera_Position'],
'Composer': a_entry['Composer'],
'Movement': a_entry['Movement'],
'File_Name': a_entry['File_Name']
}
return merged_entry
def process_entries(df):
t = df.iterrows()
a_i = None
b_i = None
merged_entries = []
for i, row in t:
if (a_i is None and row.Type == 'a'):
a_i = i
continue
if (a_i is None and row.Type != 'a'):
a_i = None
if (b_i is None and row.Type == 'b'):
b_i = i
if a_i is not None and b_i is not None:
merged_entries.append(merge_entries(df, a_i, b_i))
a_i = None
b_i = None
return merged_entries
process_entries(df)
This will give you the result:
[{'Seat': 1,
'Starttime': Timestamp('2023-01-01 09:00:00'),
'Endtime': Timestamp('2023-01-01 09:50:00'),
'Duration': Timedelta('0 days 00:50:00'),
'Code': '11',
'Notes': None,
'Asleep': False,
'Concert': 'Concert1',
'Camera_Position': 'Front',
'Composer': 'Bach',
'Movement': 'Adagio',
'File_Name': 'file1.csv'},
{'Seat': 2,
'Starttime': Timestamp('2023-01-01 09:15:00'),
'Endtime': Timestamp('2023-01-01 10:05:00'),
'Duration': Timedelta('0 days 00:50:00'),
'Code': '19',
'Notes': None,
'Asleep': True,
'Concert': 'Concert2',
'Camera_Position': 'Back',
'Composer': 'Mozart',
'Movement': 'Allegro',
'File_Name': 'file2.csv'},
{'Seat': 3,
'Starttime': Timestamp('2023-01-01 10:10:00'),
'Endtime': Timestamp('2023-01-01 11:00:00'),
'Duration': Timedelta('0 days 00:50:00'),
'Code': '11',
'Notes': None,
'Asleep': False,
'Concert': 'Concert3',
'Camera_Position': 'Left',
'Composer': 'Beethoven',
'Movement': 'Fur Elise',
'File_Name': 'file3.csv'}]