Cross-audience analysis in Polars Python

I have a code that works well in pandas, but I would like to convert it to polars. I have managed to obtain the quantities per audience, but I can't seem to achieve the cross between audiences.

I hope you can help me. I'll share the code with you and explain what I need. I have a dataframe with a list of clients and the audiences they belong to. A client can be in more than one audience. I need to obtain the number of clients in each audience and how many are present in both audiences in all possible combinations.

import pandas as pd
import itertools

data = {'DNI': ['11111111A', '22222222B', '33333333C', '44444444D', '22222222B', '66666666F', '77777777G', '88888888H', '44444444D', '22222222B', '11111111A'],
        'audience': ['A1', 'A1', 'A2', 'A2', 'A2', 'A3', 'A3', 'A3', 'A4', 'A4', 'A4']}
df = pd.DataFrame(data)

# Count the number of unique DNIs in each audience
df_audience_count = df.groupby('audience')['DNI'].nunique().reset_index().rename(columns={'DNI': 'Count'})

# Create a dictionary that maps from audience to the number of unique DNIs in that audience
audience_to_count = dict(zip(df_audience_count['audience'], df_audience_count['Count']))

# Create a list of all possible combinations of two audiences
audience_combinations = list(itertools.combinations(audience_to_count.keys(), 2))

# Create a DataFrame that contains the two audiences in each combination, as well as the number of unique DNIs in each audience and the number of unique DNIs that are in both audiences
df_combinations = pd.DataFrame(audience_combinations, columns=['audience_x', 'audience_y'])
df_combinations['audience_x_count'] = df_combinations['audience_x'].map(audience_to_count)
df_combinations['audience_y_count'] = df_combinations['audience_y'].map(audience_to_count)
df_combinations['audience_x_y_count'] = df_combinations.apply(lambda x: len(set(df[df['audience'] == x['audience_x']]['DNI']).intersection(set(df[df['audience'] == x['audience_y']]['DNI']))), axis=1)

# Calculate the percentage of customers in each audience that are also in the other audience
df_combinations['audience_x_pct_in_y'] = (df_combinations['audience_x_y_count'] / df_combinations['audience_x_count']) * 100
df_combinations['audience_y_pct_in_x'] = (df_combinations['audience_x_y_count'] / df_combinations['audience_y_count']) * 100

# Print the DataFrame
df_combinations

I achieved this, obtaining the quantities per audience, but I'm unable to obtain the cross:

import polars as pl
import itertools

data = {'DNI': ['11111111A', '22222222B', '33333333C', '44444444D', '22222222B', '66666666F', '77777777G', '88888888H', '44444444D', '22222222B','11111111A'],
        'audience': ['A1', 'A1', 'A2', 'A2', 'A2', 'A3', 'A3', 'A3', 'A4', 'A4','A4']}
df = pl.DataFrame(data)

dfcountxaudience = df.group_by('audience').agg(pl.count('DNI').alias('count'))
lstcountxaudience = dfcountxaudience.to_numpy().tolist()
diccountxaudience = {element[0]: element[1] for element in lstcountxaudience}
dfUniqueAudiences = dfcountxaudience["audience"]
PossibleCombinations = list(itertools.combinations(dfUniqueAudiences, 2))
PossibleCombinations = list(zip(*PossibleCombinations))
dfPossibleCombinations = pl.DataFrame(PossibleCombinations).rename({"column_0": "audience_x", "column_1":"audience_y"})
dfAuxiliary = dfPossibleCombinations.join(dfcountxaudience, left_on='audience_x', right_on='audience').rename({"count": "customers_x"})
dfCombinationsClients = dfAuxiliary.join(dfcountxaudience, left_on='audience_y', right_on='audience').rename({"count": "customers_y"})
dfCombinationsClients

Solution

You can use a non-equi-join to generate the combinations.

# add index to use in join_where
lf = df.lazy().with_row_index()

audience = (
   lf.select("index", "audience", pl.len().over("audience"))
     .unique("audience")
)
audience = audience.join_where(audience, pl.col.index < pl.col.index_right)

audience_x_y = (
   lf.join_where(lf,
      pl.col.DNI == pl.col.DNI_right,
      pl.col.index < pl.col.index_right
   )
   .group_by("audience", "audience_right")
   .agg(
      pl.col.DNI.count().alias("audience_x_y_count")
   )
)

audience.join(audience_x_y, 
   on = ["audience", "audience_right"], 
   how = "left"
).collect()

shape: (6, 7)
┌───────┬──────────┬─────┬─────────────┬────────────────┬───────────┬────────────────────┐
│ index ┆ audience ┆ len ┆ index_right ┆ audience_right ┆ len_right ┆ audience_x_y_count │
│ ---   ┆ ---      ┆ --- ┆ ---         ┆ ---            ┆ ---       ┆ ---                │
│ u32   ┆ str      ┆ u32 ┆ u32         ┆ str            ┆ u32       ┆ u32                │
╞═══════╪══════════╪═════╪═════════════╪════════════════╪═══════════╪════════════════════╡
│ 0     ┆ A1       ┆ 2   ┆ 2           ┆ A2             ┆ 3         ┆ 1                  │
│ 0     ┆ A1       ┆ 2   ┆ 5           ┆ A3             ┆ 3         ┆ null               │
│ 0     ┆ A1       ┆ 2   ┆ 8           ┆ A4             ┆ 3         ┆ 2                  │
│ 2     ┆ A2       ┆ 3   ┆ 5           ┆ A3             ┆ 3         ┆ null               │
│ 2     ┆ A2       ┆ 3   ┆ 8           ┆ A4             ┆ 3         ┆ 2                  │
│ 5     ┆ A3       ┆ 3   ┆ 8           ┆ A4             ┆ 3         ┆ null               │
└───────┴──────────┴─────┴─────────────┴────────────────┴───────────┴────────────────────┘

Combinations

We add a row index, add the group length and unique by audience.

audience = (
   lf.select("index", "audience", pl.len().over("audience"))
     .unique("audience")
)

┌───────┬──────────┬─────┐
│ index ┆ audience ┆ len │
│ ---   ┆ ---      ┆ --- │
│ u32   ┆ str      ┆ u32 │
╞═══════╪══════════╪═════╡
│ 0     ┆ A1       ┆ 2   │
│ 8     ┆ A4       ┆ 3   │
│ 5     ┆ A3       ┆ 3   │
│ 2     ┆ A2       ┆ 3   │
└───────┴──────────┴─────┘

We then do a self .join_where() using index as the predicate to get the combinations.

audience.join_where(audience, pl.col.index < pl.col.index_right)

┌───────┬──────────┬─────┬─────────────┬────────────────┬───────────┐
│ index ┆ audience ┆ len ┆ index_right ┆ audience_right ┆ len_right │
│ ---   ┆ ---      ┆ --- ┆ ---         ┆ ---            ┆ ---       │
│ u32   ┆ str      ┆ u32 ┆ u32         ┆ str            ┆ u32       │
╞═══════╪══════════╪═════╪═════════════╪════════════════╪═══════════╡
│ 0     ┆ A1       ┆ 2   ┆ 2           ┆ A2             ┆ 3         │
│ 0     ┆ A1       ┆ 2   ┆ 5           ┆ A3             ┆ 3         │
│ 0     ┆ A1       ┆ 2   ┆ 8           ┆ A4             ┆ 3         │
│ 2     ┆ A2       ┆ 3   ┆ 5           ┆ A3             ┆ 3         │
│ 2     ┆ A2       ┆ 3   ┆ 8           ┆ A4             ┆ 3         │
│ 5     ┆ A3       ┆ 3   ┆ 8           ┆ A4             ┆ 3         │
└───────┴──────────┴─────┴─────────────┴────────────────┴───────────┘

The same approach is used to generate audience_x_y - just with the added predicate of DNI being equal.