python, pandas, performance, pyspark, python-itertools

Is there a way to make this code less memory-consuming?


I have written some logic that counts how many people bought the same products together. It works, but it is really inefficient (it keeps running out of memory).

Therefore, I hope someone has an approach that uses less memory than mine.

This is what I have done:

df:  # Note: the code to reproduce this DataFrame is at the bottom of the question.

Order_Number    Country    Product
1               Ger        [A,B]
2               NL         [A,B,C]
3               USA        [C,D]
4               NL         [B,C,D]
5               Ger        [A]

I would like to know how many customers bought the same products together (with a minimum of two products, obviously):

list_df = [df]
main_dataframe = pd.DataFrame()

# Example for two products bought together
for X in list_df:
    combinations_list = []
    for row in X.Product:
        combinations = list(itertools.combinations(row, 2))  # only counting 2 products here
        combinations_list.append(combinations)
    Products_DF = pd.Series(combinations_list).explode().reset_index(drop=True)
    Products_DF = Products_DF.value_counts()

    Products_DF = Products_DF.rename_axis('Product').reset_index(name='Occurrence')
    Products_DF['Product_Combinations'] = 2  # only counting 2 products here
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.x; pd.concat is the replacement
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF

Then I repeat the above for 3, 4, 5, 6 and 7 products bought together, appending everything to my main_dataframe (see the sketch below for what that loop roughly looks like).

The result is one dataframe containing the country, the products bought together, and the occurrence, just like the output of the complete example below.
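
For reference, folded into a single loop over the combination sizes, the repeated step looks roughly like the sketch below. This is only a sketch of my existing logic: it assumes the Product column holds actual lists and leaves out the Country handling.

import itertools
import pandas as pd

# Toy data matching the table above; in my real code `df` already exists
df = pd.DataFrame({'Order_Number': ['1', '2', '3', '4', '5'],
                   'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
                   'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]})

frames = []
for n in range(2, 8):  # combination sizes 2 through 7
    # sorted() normalises tuple order so the same set of products is always counted together
    combos = df['Product'].apply(lambda products: list(itertools.combinations(sorted(products), n))).explode().dropna()
    counts = combos.value_counts().rename_axis('Product').reset_index(name='Occurrence')
    counts['Product_Combinations'] = n
    frames.append(counts)

main_dataframe = pd.concat(frames, ignore_index=True)
print(main_dataframe)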

Many thanks in advance!

PS: I'm also open to PySpark solutions (everything is appreciated!)

Complete example:

import pandas as pd
import itertools

df = {'Order_Number': ['1', '2', '3', '4', '5'],
      'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
      'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]}

# Creates pandas DataFrame.  
df = pd.DataFrame(df)  

df = [df] # sorry, this is legacy in my code 


main_dataframe = pd.DataFrame()
# Example for two products bought together 
for X in df:
    combinations_list = []
    for row in X.Product:
        combinations = list(itertools.combinations(row, 2))  # only counting 2 products here
        combinations_list.append(combinations)
    Products_DF = pd.Series(combinations_list).explode().reset_index(drop=True)
    Products_DF = Products_DF.value_counts()

    Products_DF = Products_DF.rename_axis('Product').reset_index(name='Occurrence')
    Products_DF['Product_Combinations'] = 2  # only counting 2 products here
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.x; pd.concat is the replacement
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF

Solution

  • Final cleaned-up edit: I tested 1e6 orders with 500 unique products and got results in about 3 minutes.

    I filtered the results by requiring a minimum number of unique orders per group and by stopping at 3-product groups. Both can be changed in the function call:

    memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)


    Output

    testing 500 products in 1000000.0 orders where there are between 1 and 10 products per order
    made test data in 72.29691982269287 seconds
    counted_groups in 199.78216004371643 seconds
    showing first five 3-member groups sorted by decreasing number of orders
    ('product_259', 'product_263', 'product_435') 11
    ('product_252', 'product_432', 'product_63') 11
    ('product_114', 'product_139', 'product_156') 11
    ('product_101', 'product_11', 'product_179') 11
    ('product_115', 'product_301', 'product_45') 11
    

    Code

    import pandas as pd
    import collections
    
    import time #just for timing
    import numpy as np #just for creating the test data
    np.random.seed(1)
    
    def init_data():
        #Initialize test data
        start_create_data = time.time()
        num_products = 500
        num_orders = 1e6
        max_order_size = 10
    
        print('testing',num_products,'products in',num_orders,'orders where there are between 1 and',max_order_size,'products per order')
    
        products = ['product_'+str(p) for p in range(num_products)]
    
        order_sizes = np.round(np.random.uniform(low=1,high=max_order_size,size=int(num_orders)))
    
        order_list = []
        for order_size in order_sizes:
            order = np.random.choice(products,int(order_size),replace=False)
            order_list.append(order)
        
        # Create pandas DataFrame
        df = pd.DataFrame({'Product':order_list})
        print('made test data in',time.time()-start_create_data,'seconds')
        return df
    
    def find_product_groups(df, min_unique_orders=1, max_group_size=None):
        
        #Initialize the memo with single-product groups
        memo = collections.defaultdict(set)
        for ind,order in df['Product'].items():
            for product in order:
                memo[(product,)].add(ind)
    
        #Build up the groups from 1-smaller groups, updating the memo
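        # (In effect this is an Eclat/Apriori-style search: a group's set of order
        # indices is intersected with a single product's set, and the larger group
        # is only kept, and later extended, if the intersection still covers at
        # least min_unique_orders orders.)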
        unique_prods = sorted(p[0] for p in memo.keys())
        groups = list(memo.keys())
        
        while groups:
            group = groups.pop()
            
            #Skip if the max_group_size is set and this group is already max size
            if max_group_size and len(group) == max_group_size:
                continue
            
            last_prod = group[-1]
            last_prod_ind = unique_prods.index(last_prod)
            for prod in unique_prods[last_prod_ind+1:]:
                new_inds = memo[group].intersection(memo[(prod,)])
    
                #Only add the new group if there are enough orders
                if len(new_inds) >= min_unique_orders:
                    new_group = (*group,prod)
                    memo[new_group] = new_inds
                    groups.append(new_group)
                    
        return memo
    
    
    df = init_data()
    
    start_count_groups = time.time()
    memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
    print('counted_groups in',time.time()-start_count_groups,'seconds')
    
    
    print('showing first five 3-member groups sorted by decreasing number of orders')
    filt_memo = {group:order_inds for group,order_inds in memo.items() if len(group) >= 3}
    filt_memo = dict(sorted(filt_memo.items(),key=lambda kv: -len(kv[1]))[:5])
    
    for group,order_inds in filt_memo.items():
        print(group,len(order_inds))
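
    If you want the counts in the same shape as your main_dataframe (product group, occurrence, group size), the memo can be flattened into a dataframe. The sketch below continues from the code above; I left the Country column out, but each group's order indices are still in memo, so countries could be looked up from the original orders if needed.

    # Sketch: flatten memo (group tuple -> set of order indices) into a dataframe
    # shaped like the question's main_dataframe (Country left out here)
    rows = [{'Product': group,
             'Occurrence': len(order_inds),
             'Product_Combinations': len(group)}
            for group, order_inds in memo.items()
            if len(group) >= 2]  # skip the single-product entries
    result = pd.DataFrame(rows).sort_values('Occurrence', ascending=False).reset_index(drop=True)
    print(result.head())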