I have this function which is very slow, essentially I need to create cumulative sequence of string.
I tried with NumPy but there was no time improvement.
def create_sequence_shifted(df) -> list:
articles = []
customers = df["CUSTOMER"].drop_duplicates()
for customer in customers:
tmp = df[df["CUSTOMER"]==customer].sort_values(by="DATE").reset_index(drop=True)
i=0
for x in tmp["DOCUMENT_CODE"].drop_duplicates():
if i==0:
articles += [0]
sequence = tmp[tmp["DOCUMENT_CODE"]==x]["PRODUCT_ID"].str.cat(sep =", ")
i = 1
else:
articles += [sequence]
sequence = articles[-1] +", " + tmp[tmp["DOCUMENT_CODE"]==x]["PRODUCT_ID"].str.cat(sep =", ")
return articles
Starting data:
CUSTOMER | DATE | DOCUMENT_CODE | ARTICLE_CODE |
---|---|---|---|
1 | 01/01/22 | a | x |
1 | 01/01/22 | a | y |
1 | 02/01/22 | b | z |
1 | 03/01/22 | d | w |
2 | 01/01/22 | c | x |
To:
CUSTOMER | DATE | DOCUMENT_CODE | ARTICLE_CODE |
---|---|---|---|
1 | 01/01/22 | a | 0 |
1 | 02/01/22 | b | x,y |
1 | 03/01/22 | d | x,y,z |
2 | 01/01/22 | c | 0 |
IIUC you need a double groupby
:
out = (
df.groupby(['CUSTOMER', 'DATE', 'DOCUMENT_CODE'])
['ARTICLE_CODE'].agg(','.join)
.groupby('CUSTOMER').shift(fill_value=0)
.reset_index()
)
Output:
CUSTOMER DATE DOCUMENT_CODE ARTICLE_CODE
0 1 01/01/22 a 0
1 1 02/01/22 b x,y
2 2 01/01/22 c 0
sep = ','
out = (
df.groupby(['CUSTOMER', 'DATE', 'DOCUMENT_CODE'])
['ARTICLE_CODE'].agg(sep.join)
.groupby('CUSTOMER')
.apply(lambda s: s.add(sep).cumsum().str[:-len(sep)].shift(fill_value=0))
.reset_index()
)
Output:
CUSTOMER DATE DOCUMENT_CODE ARTICLE_CODE
0 1 01/01/22 a 0
1 1 02/01/22 b x,y
2 1 03/01/22 d x,y,z
3 2 01/01/22 c 0