Tags: python, pandas, string, group-by

How to speed up cumulative concatenation with Pandas?


I have this function, which is very slow. Essentially, I need to build a cumulative concatenation of strings (the running history of articles per customer).

I tried NumPy as well, but there was no speed improvement.

def create_sequence_shifted(df) -> list:
    articles = []
    customers = df["CUSTOMER"].drop_duplicates()
    for customer in customers:
        # one customer at a time, documents in date order
        tmp = df[df["CUSTOMER"] == customer].sort_values(by="DATE").reset_index(drop=True)
        i = 0
        for x in tmp["DOCUMENT_CODE"].drop_duplicates():
            if i == 0:
                # first document of the customer: no history yet
                articles += [0]
                sequence = tmp[tmp["DOCUMENT_CODE"] == x]["ARTICLE_CODE"].str.cat(sep=", ")
                i = 1
            else:
                # emit the history so far, then extend it with this document's articles
                articles += [sequence]
                sequence = articles[-1] + ", " + tmp[tmp["DOCUMENT_CODE"] == x]["ARTICLE_CODE"].str.cat(sep=", ")

    return articles

Starting data:

CUSTOMER      DATE DOCUMENT_CODE ARTICLE_CODE
       1  01/01/22             a            x
       1  01/01/22             a            y
       1  02/01/22             b            z
       1  03/01/22             d            w
       2  01/01/22             c            x

Desired output:

CUSTOMER      DATE DOCUMENT_CODE ARTICLE_CODE
       1  01/01/22             a            0
       1  02/01/22             b          x,y
       1  03/01/22             d        x,y,z
       2  01/01/22             c            0
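
For reference, a minimal runnable setup for the example (the DataFrame below is built directly from the starting-data table; nothing else is assumed):

import pandas as pd

# sample frame matching the starting data above
df = pd.DataFrame({
    "CUSTOMER": [1, 1, 1, 1, 2],
    "DATE": ["01/01/22", "01/01/22", "02/01/22", "03/01/22", "01/01/22"],
    "DOCUMENT_CODE": ["a", "a", "b", "d", "c"],
    "ARTICLE_CODE": ["x", "y", "z", "w", "x"],
})

print(create_sequence_shifted(df))
# [0, 'x, y', 'x, y, z', 0] -- one entry per (customer, document),
# i.e. the ARTICLE_CODE column of the desired output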

Solution

  • If I understand correctly (IIUC), you need a double groupby:

    out = (
     df.groupby(['CUSTOMER', 'DATE', 'DOCUMENT_CODE'])
       ['ARTICLE_CODE'].agg(','.join)
       .groupby('CUSTOMER').shift(fill_value=0)
       .reset_index()
    )
    

    Output:

       CUSTOMER      DATE DOCUMENT_CODE ARTICLE_CODE
    0         1  01/01/22             a            0
    1         1  02/01/22             b          x,y
    2         1  03/01/22             d            z
    3         2  01/01/22             c            0
    
    This only carries over the previous document's articles. For the cumulated aggregation (the full history, matching your expected output):

    sep = ','
    out = (
     df.groupby(['CUSTOMER', 'DATE', 'DOCUMENT_CODE'])
       ['ARTICLE_CODE'].agg(sep.join)
       # group_keys=False keeps the original index, so reset_index works on recent pandas
       .groupby('CUSTOMER', group_keys=False)
       .apply(lambda s: s.add(sep).cumsum().str[:-len(sep)].shift(fill_value=0))
       .reset_index()
    )
    

    Output:

       CUSTOMER      DATE DOCUMENT_CODE ARTICLE_CODE
    0         1  01/01/22             a            0
    1         1  02/01/22             b          x,y
    2         1  03/01/22             d        x,y,z
    3         2  01/01/22             c            0
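
    If the shifted history has to go back onto the original row-level frame (one row per article), a merge on the three grouping keys works. A sketch, assuming the `out` from the cumulated aggregation above; the PREVIOUS_ARTICLES column name is only illustrative:

    # rename first so the result does not collide with the original ARTICLE_CODE column
    enriched = df.merge(
        out.rename(columns={'ARTICLE_CODE': 'PREVIOUS_ARTICLES'}),
        on=['CUSTOMER', 'DATE', 'DOCUMENT_CODE'],
        how='left',
    )

    Every article row then carries the concatenation of that customer's earlier documents.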