As shown in this code, I need to use certain data in one table as a basis to modify another table and add some information. When this kind of table information scale is large, this violent traversal method is very inefficient. How should I modify it? Moreover, multiple sheets need to be compared, which makes the efficiency even lower.
df = pd.DataFrame({'id': [123, 321, 456, 543], 'name': ['xxx', 'yyy', 'zzz', 'www']})
df.set_index('id', inplace=True)
df_1 = pd.DataFrame({'id': [123, 321, 456, 543], 'name': ['xxx', 'yyy', 'zzz', 'www'], 'complete': ['yes', 'yes', 'yes', 'yes'], 'course_name':['AA', 'BB', 'AA', 'DD'], 'complete_date': ['1.1', '1.2', '1.1', '1.5']})
df_1.set_index('id', inplace=True)
group_df = df_1.groupby('course_name')
info = dict()
for course_name, course_df in group_df:
info[course_name]=[]
def process(row):
info[course_name].append(Subscriber(*row.tolist()))
get_info = course_df.loc[course_df["complete"] == "yes"]
get_columns = ['name', 'complete_date']
finish_df = get_info[get_columns]
Subscriber = namedtuple('Subscriber', ['name', 'complete_date'])
finish_df.apply(process, axis = 1)
# print(info)
# {'AA': [Subscriber(name='xxx', complete='yes'), Subscriber(name='zzz', complete='yes')], 'BB': [Subscriber(name='yyy', complete='yes')], 'DD': [Subscriber(name='www', complete='yes')]}
'''modify df'''
names = set(df['name'])
for course in info.keys():
for name, date in info[course]:
if name in names:
df.loc[df['name'] == name, course] = date + 'yes'
pass
# print(df)
As Ben.T is pointing out in the comments, a pivot approach would do the job. Here is an example of how you can do this:
import pandas as pd
from collections import namedtuple
df = pd.DataFrame({'id': [123, 321, 456, 543], 'name': ['xxx', 'yyy', 'zzz', 'www']})
df_1 = pd.DataFrame({'id': [123, 321, 456, 543], 'name': ['xxx', 'yyy', 'zzz', 'www'], 'complete': ['yes', 'yes', 'yes', 'yes'], 'course_name': ['AA', 'BB', 'AA', 'DD'], 'complete_date': ['1.1', '1.2', '1.1', '1.5']})
completed_courses = df_1[df_1['complete'] == 'yes']
pivot_df = completed_courses.pivot_table(index='id', columns='course_name', values='complete_date', aggfunc='first')
result_df = df.set_index('id').join(pivot_df, on='id')
result_df = result_df.fillna('')
for col in pivot_df.columns:
result_df[col] = result_df[col].apply(lambda x: 'yes' if x != '' else x)
result_df.reset_index(inplace=True)
print(result_df)
which gives you
id name AA BB DD
0 123 xxx yes
1 321 yyy yes
2 456 zzz yes
3 543 www yes