python, pandas, performance, pyspark, python-itertools

Is there a way to make this code less memory-consuming?


I have written some logic that counts how many people bought the same products together. It works, but it is really inefficient (it keeps running out of memory).

Therefore, I hope someone has an approach that uses less memory than mine.

This is what I have done:

df:  # Note: the code to reproduce this DataFrame is at the bottom of the question.

Order_Number    Country    Product
1               Ger        [A,B]
2               NL         [A,B,C]
3               USA        [C,D]
4               NL         [B,C,D]
5               Ger        [A]

I would like to know how many customers bought the same products together (with a minimum of two products, obviously):

list_df = [df]
main_dataframe = pd.DataFrame()

# Example for two products bought together
for X in list_df:
    combinations_list = []
    for row in X.Product:
        combinations = list(itertools.combinations(row, 2))  # only counting 2 products here
        combinations_list.append(combinations)
    Products_DF = pd.Series(combinations_list).explode().reset_index(drop=True)
    Products_DF = Products_DF.value_counts()

    Products_DF = Products_DF.rename_axis('Product').reset_index(name='Occurrence')
    Products_DF['Product_Combinations'] = 2  # only counting 2 products here
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.x; pd.concat is the replacement
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF

Then I repeat the above for 3, 4, 5, 6 and 7 products bought together, appending everything to my main_dataframe (see the sketch below for what that loop roughly looks like).

The result is one dataframe containing the country, the products bought together, and the occurrence, just like the output of the complete example below.
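
For reference, folded into a single loop over the combination sizes, the repeated step looks roughly like the sketch below. This is only a sketch of my existing logic: it assumes the Product column holds actual lists and leaves out the Country handling.

import itertools
import pandas as pd

# Toy data matching the table above; in my real code `df` already exists
df = pd.DataFrame({'Order_Number': ['1', '2', '3', '4', '5'],
                   'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
                   'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]})

frames = []
for n in range(2, 8):  # combination sizes 2 through 7
    # sorted() normalises tuple order so the same set of products is always counted together
    combos = df['Product'].apply(lambda products: list(itertools.combinations(sorted(products), n))).explode().dropna()
    counts = combos.value_counts().rename_axis('Product').reset_index(name='Occurrence')
    counts['Product_Combinations'] = n
    frames.append(counts)

main_dataframe = pd.concat(frames, ignore_index=True)
print(main_dataframe)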

Many thanks in advance!

PS: I'm also open to PySpark solutions (everything is appreciated!)

Complete example:

import pandas as pd
import itertools

df = {'Order_Number': ['1', '2', '3', '4', '5'],
      'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
      'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]}

# Creates pandas DataFrame.  
df = pd.DataFrame(df)  

df = [df] # sorry, this is legacy in my code 


main_dataframe = pd.DataFrame()
# Example for two products bought together 
for X in df:
    combinations_list = []
    for row in X.Product:
        combinations = list(itertools.combinations(row, 2))  # only counting 2 products here
        combinations_list.append(combinations)
    Products_DF = pd.Series(combinations_list).explode().reset_index(drop=True)
    Products_DF = Products_DF.value_counts()

    Products_DF = Products_DF.rename_axis('Product').reset_index(name='Occurrence')
    Products_DF['Product_Combinations'] = 2  # only counting 2 products here
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.x; pd.concat is the replacement
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF

Solution

  • Final cleaned-up edit: I tested 1e6 orders with 500 unique products and got results in about 3 minutes.

    I filtered the results by requiring a minimum number of unique orders per group and by stopping at 3-product groups. Both can be changed in the function call:

    memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)


    Output

    testing 500 products in 1000000.0 orders where there are between 1 and 10 products per order
    made test data in 72.29691982269287 seconds
    counted_groups in 199.78216004371643 seconds
    showing first five 3-member groups sorted by decreasing number of orders
    ('product_259', 'product_263', 'product_435') 11
    ('product_252', 'product_432', 'product_63') 11
    ('product_114', 'product_139', 'product_156') 11
    ('product_101', 'product_11', 'product_179') 11
    ('product_115', 'product_301', 'product_45') 11
    

    Code

    import pandas as pd
    import collections
    
    import time #just for timing
    import numpy as np #just for creating the test data
    np.random.seed(1)
    
    def init_data():
        #Initialize test data
        start_create_data = time.time()
        num_products = 500
        num_orders = 1e6
        max_order_size = 10
    
        print('testing',num_products,'products in',num_orders,'orders where there are between 1 and',max_order_size,'products per order')
    
        products = ['product_'+str(p) for p in range(num_products)]
    
        order_sizes = np.round(np.random.uniform(low=1,high=max_order_size,size=int(num_orders)))
    
        order_list = []
        for order_size in order_sizes:
            order = np.random.choice(products,int(order_size),replace=False)
            order_list.append(order)
        
        # Create pandas DataFrame
        df = pd.DataFrame({'Product':order_list})
        print('made test data in',time.time()-start_create_data,'seconds')
        return df
    
    def find_product_groups(df, min_unique_orders=1, max_group_size=None):
        
        #Initialize the memo with single-product groups
        memo = collections.defaultdict(set)
        for ind,order in df['Product'].items():
            for product in order:
                memo[(product,)].add(ind)
    
        #Build up the groups from 1-smaller groups, updating the memo
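        # (In effect this is an Eclat/Apriori-style search: a group's set of order
        # indices is intersected with a single product's set, and the larger group
        # is only kept, and later extended, if the intersection still covers at
        # least min_unique_orders orders.)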
        unique_prods = sorted(p[0] for p in memo.keys())
        groups = list(memo.keys())
        
        while groups:
            group = groups.pop()
            
            #Skip if the max_group_size is set and this group is already max size
            if max_group_size and len(group) == max_group_size:
                continue
            
            last_prod = group[-1]
            last_prod_ind = unique_prods.index(last_prod)
            for prod in unique_prods[last_prod_ind+1:]:
                new_inds = memo[group].intersection(memo[(prod,)])
    
                #Only add the new group if there are enough orders
                if len(new_inds) >= min_unique_orders:
                    new_group = (*group,prod)
                    memo[new_group] = new_inds
                    groups.append(new_group)
                    
        return memo
    
    
    df = init_data()
    
    start_count_groups = time.time()
    memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
    print('counted_groups in',time.time()-start_count_groups,'seconds')
    
    
    print('showing first five 3-member groups sorted by decreasing number of orders')
    filt_memo = {group:order_inds for group,order_inds in memo.items() if len(group) >= 3}
    filt_memo = dict(sorted(filt_memo.items(),key=lambda kv: -len(kv[1]))[:5])
    
    for group,order_inds in filt_memo.items():
        print(group,len(order_inds))
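
    If you want the counts in the same shape as your main_dataframe (product group, occurrence, group size), the memo can be flattened into a dataframe. The sketch below continues from the code above; I left the Country column out, but each group's order indices are still in memo, so countries could be looked up from the original orders if needed.

    # Sketch: flatten memo (group tuple -> set of order indices) into a dataframe
    # shaped like the question's main_dataframe (Country left out here)
    rows = [{'Product': group,
             'Occurrence': len(order_inds),
             'Product_Combinations': len(group)}
            for group, order_inds in memo.items()
            if len(group) >= 2]  # skip the single-product entries
    result = pd.DataFrame(rows).sort_values('Occurrence', ascending=False).reset_index(drop=True)
    print(result.head())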