I have to calculate in Python the number of unique active members by year, month, and group for a large dataset (N ~ 30M). Membership always starts at the beginning of the month and ends at the end of the month. Here is a very small subset of the data.
print(df.head(6))
   member_id type start_date   end_date
1         10    A 2021-12-01 2022-05-31
2         22    B 2022-01-01 2022-07-31
3         17    A 2022-01-01 2022-06-30
4         57    A 2022-02-02 2022-02-28
5         41    B 2022-02-02 2022-04-30
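For reproducibility, the sample above can be rebuilt like this (a sketch of my setup; the print index is dropped and the dates are parsed as datetimes):

```python
import io
import pandas as pd

# Rebuild the printed sample (minus the print index) with parsed dates.
df = pd.read_csv(
    io.StringIO("""\
member_id type start_date end_date
10 A 2021-12-01 2022-05-31
22 B 2022-01-01 2022-07-31
17 A 2022-01-01 2022-06-30
57 A 2022-02-02 2022-02-28
41 B 2022-02-02 2022-04-30
"""),
    sep=r'\s+',
    parse_dates=['start_date', 'end_date'],
)
```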
My current solution is inefficient as it relies on a for loop:
import pandas as pd

date_list = pd.date_range(
    start=min(df.start_date),
    end=max(df.end_date),
    freq='MS'
)
members = pd.DataFrame()
for d in date_list:
    df['date_filter'] = (
        (d >= df.start_date)
        & (d <= df.end_date)
    )
    grouped_members = (
        df
        .loc[df.date_filter]
        .groupby(by='type', as_index=False)
        .member_id
        .nunique()
    )
    member_counts = pd.DataFrame(
        data={'year': d.year, 'month': d.month},
        index=[0]
    )
    member_counts = member_counts.merge(
        right=grouped_members,
        how='cross'
    )
    members = pd.concat([members, member_counts])
members = members.reset_index(drop=True)
It produces the following:
print(members)
    year  month type  member_id
0   2021     12    A          1
1   2021     12    B          0
2   2022      1    A          3
3   2022      1    B          1
4   2022      2    A          3
5   2022      2    B          2
6   2022      3    A          2
7   2022      3    B          2
8   2022      4    A          2
9   2022      4    B          2
10  2022      5    A          2
11  2022      5    B          1
12  2022      6    A          1
13  2022      6    B          1
14  2022      7    A          0
15  2022      7    B          1
I'm looking for a completely vectorized solution to reduce computational time.
Updated answer that avoids melt. Maybe faster? It uses the same idea as before: we don't actually care about member ids, we are just keeping track of start/end counts.
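As a minimal toy illustration of that idea (separate from the actual solution below): each membership contributes +1 in its start month and -1 in the month after its end month, and a cumulative sum recovers the active count per month.

```python
import pandas as pd

# Toy sketch of the start/end-count idea: +1 when a membership starts,
# -1 the month after it ends; cumsum gives the active count per month.
changes = pd.Series(
    [2, -1, -1],
    index=pd.PeriodIndex(['2022-01', '2022-03', '2022-05'], freq='M'),
)
months = pd.period_range('2022-01', '2022-05', freq='M')
active = changes.reindex(months, fill_value=0).cumsum()
print(active.tolist())  # two members active Jan-Feb, one Mar-Apr, none in May
```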
#Create multiindexed series for reindexing later
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')
ind = pd.MultiIndex.from_product(
    [df.type.unique(), months],
    names=['type', 'month'],
)
#push each end date to the next month
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Get cumsum counts per type/month of start and ends
gb_counts = (
    df.groupby('type').agg(
        start=('start_date', 'value_counts'),
        end=('end_date', 'value_counts'),
    )
    .reindex(ind)
    .fillna(0)
    .groupby('type')
    .cumsum()
    .astype(int)
)
counts = (gb_counts.start - gb_counts.end).unstack()
counts
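If the long year/month/type layout from the question is wanted rather than the wide table, `counts` can be stacked back. Here is a self-contained sketch (it rebuilds the example data so it runs on its own, and uses per-column `value_counts` instead of the combined `agg` call above):

```python
import io
import pandas as pd

# Self-contained sketch: start/end cumsum counts, then reshaping the wide
# `counts` table into the question's long year/month/type format.
df = pd.read_csv(
    io.StringIO("""
0 member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
"""),
    sep=r'\s+',
    index_col=0,
    parse_dates=['start_date', 'end_date'],
).reset_index(drop=True)

months = pd.date_range(df.start_date.min(), df.end_date.max(), freq='MS').to_period('M')
ind = pd.MultiIndex.from_product([df.type.unique(), months], names=['type', 'month'])

df['end_date'] += pd.Timedelta(days=1)  # push each month-end to the next month
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')

# Per-type running counts of starts and ends
start = df.groupby('type')['start_date'].value_counts().rename_axis(['type', 'month'])
end = df.groupby('type')['end_date'].value_counts().rename_axis(['type', 'month'])
gb_counts = (
    pd.DataFrame({'start': start, 'end': end})
    .reindex(ind)
    .fillna(0)
    .groupby('type')
    .cumsum()
    .astype(int)
)
counts = (gb_counts.start - gb_counts.end).unstack()

# Wide -> long: one row per (year, month, type)
long = counts.stack().rename('member_id').reset_index()
long['year'] = long['month'].dt.year
long['month'] = long['month'].dt.month
long = (
    long[['year', 'month', 'type', 'member_id']]
    .sort_values(['year', 'month', 'type'])
    .reset_index(drop=True)
)
print(long)
```

Note that on the five sample rows shown this gives 2 active A members in January 2022 (members 10 and 17); the question's printed output was presumably generated from more rows than the pasted subset.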
Original answer
Updated answer that works unless the same member_id/group has overlapping date ranges (in which case it double-counts)
The idea is to keep track of when the number of users changes per group instead of exploding out all months per user.
I think this should be very fast, and I'm curious how it performs on the full dataset.
Code (looks long but is mostly comments)
import pandas as pd
import itertools
#Load example data
import io #just for reading in your example table
df = pd.read_csv(
    io.StringIO("""
0 member_id type start_date end_date
1 10 A 2021-12-01 2022-05-31
2 22 B 2022-01-01 2022-07-31
3 17 A 2022-01-01 2022-06-30
4 57 A 2022-02-02 2022-02-28
5 41 B 2022-02-02 2022-04-30
"""),
    sep=r'\s+',  # delim_whitespace=True is deprecated
    index_col=0,
    parse_dates=['start_date', 'end_date'],
).reset_index(drop=True)
#Create the full (type, month) index for reindexing and ffill
months = pd.date_range(
    start=df.start_date.min(),
    end=df.end_date.max(),
    freq='MS',
).to_period('M')
cat_ind = pd.MultiIndex.from_tuples(
    itertools.product(df.type.unique(), months),
    names=['type', 'date'],
)
#push each end date to the next month
df['end_date'] += pd.DateOffset(1)
#Convert the dates to yyyy-mm
df['start_date'] = df.start_date.dt.to_period('M')
df['end_date'] = df.end_date.dt.to_period('M')
#Melt from:
#
#member_id | type | start_date | end_date
#----------|------|------------|-----------
# 10 | A | 2021-12-01 | 2022-05-31
# ...
#
#to
#
# type | active_users | date
#----------------------------
# A | start_date | 2021-12-01
# A | end_date | 2022-05-31
# ...
df = df.melt(
    id_vars='type',
    value_vars=['start_date', 'end_date'],
    var_name='active_users',
    value_name='date',
).sort_values('date')
#Replace var column with +1/-1 for start/end date rows
#
# type | active_users | date
#----------------------------
# A | 1 | 2021-12-01
# A | -1 | 2022-05-31
# ...
df['active_users'] = df.active_users.replace({'start_date':1,'end_date':-1})
#Sum the +1/-1 changes within each type/date, then cumsum per type to get
#the running number of active users
df = df.groupby(['type','date']).sum().groupby(level='type').cumsum()
#Reindex to ffill missing dates within each type; months before a type's
#first change have no active users yet, hence fillna(0)
df = df.reindex(cat_ind).groupby(level='type').ffill().fillna(0).astype(int)
df.unstack()
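To gain some confidence in the counting logic (my addition, not part of the original post), the start/end cumsum can be cross-checked against a brute-force per-month scan on randomly generated, month-aligned memberships:

```python
import numpy as np
import pandas as pd

# Sanity check: compare the start/end cumsum counts against a brute-force
# per-month scan on random month-aligned memberships.
rng = np.random.default_rng(0)
n = 300
base = pd.Period('2021-01', freq='M')
start_p = [base + int(k) for k in rng.integers(0, 24, n)]
end_p = [s + int(d) for s, d in zip(start_p, rng.integers(0, 12, n))]
df = pd.DataFrame({
    'member_id': np.arange(n),
    'type': rng.choice(['A', 'B'], n),
    'start_date': [p.start_time for p in start_p],
    'end_date': [p.end_time.normalize() for p in end_p],
})

months = pd.period_range(df.start_date.min(), df.end_date.max(), freq='M')
grid = pd.MultiIndex.from_product([['A', 'B'], months], names=['type', 'month'])

# Brute force: filter the frame once per month
brute = {}
for m in months:
    d = m.start_time
    active = df[(df.start_date <= d) & (df.end_date >= d)]
    for t in ['A', 'B']:
        brute[(t, m)] = active.loc[active['type'] == t, 'member_id'].nunique()

# Vectorized: per-type cumsum of start/end counts
tmp = df.copy()
tmp['end_date'] += pd.Timedelta(days=1)
tmp['s'] = tmp.start_date.dt.to_period('M')
tmp['e'] = tmp.end_date.dt.to_period('M')
start = tmp.groupby('type')['s'].value_counts().rename_axis(['type', 'month'])
end = tmp.groupby('type')['e'].value_counts().rename_axis(['type', 'month'])
vec = (
    pd.DataFrame({'start': start, 'end': end})
    .reindex(grid)
    .fillna(0)
    .groupby('type')
    .cumsum()
    .astype(int)
)
vec = vec.start - vec.end

print(all(int(vec[key]) == brute[key] for key in grid))  # True if they agree
```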