python, pandas, datetime, vectorization

How to efficiently calculate membership counts by month and group


I have to calculate in Python the number of unique active members by year, month, and group for a large dataset (N ~ 30M). Membership always starts at the beginning of the month and ends at the end of the month. Here is a very small subset of the data.

print(df.head(6))
   member_id  type  start_date    end_date
1         10     A  2021-12-01  2022-05-31
2         22     B  2022-01-01  2022-07-31
3         17     A  2022-01-01  2022-06-30
4         57     A  2022-02-02  2022-02-28
5         41     B  2022-02-02  2022-04-30
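
For reference, the rows shown above can be built directly like this (a sketch; the loop below assumes start_date and end_date are already datetime columns):

import pandas as pd

df = pd.DataFrame(
    {
        'member_id': [10, 22, 17, 57, 41],
        'type': ['A', 'B', 'A', 'A', 'B'],
        'start_date': ['2021-12-01', '2022-01-01', '2022-01-01', '2022-02-02', '2022-02-02'],
        'end_date': ['2022-05-31', '2022-07-31', '2022-06-30', '2022-02-28', '2022-04-30'],
    },
    index=range(1, 6),
)
#the date comparisons in the loop below need real datetimes, not strings
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])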

My current solution is inefficient as it relies on a for loop:

import pandas as pd


date_list = pd.date_range(
    start=min(df.start_date),
    end=max(df.end_date),
    freq='MS'
)
members = pd.DataFrame()

for d in date_list:
    df['date_filter'] = (
        (d >= df.start_date)
        & (d <= df.end_date)
    )
    grouped_members = (
         df
         .loc[df.date_filter]
         .groupby(by='type', as_index=False)
         .member_id
         .nunique()
    )
    member_counts = pd.DataFrame(
         data={'year': d.year, 'month': d.month},
         index=[0]
    )
    member_counts = member_counts.merge(
         right=grouped_members,
         how='cross'
    )
    members = pd.concat([members, member_counts])
members = members.reset_index(drop=True)

It produces the following:

print(members)

    year  month  type  member_id
 0  2021     12     A          1
 1  2021     12     B          0
 2  2022      1     A          3
 3  2022      1     B          1
 4  2022      2     A          3
 5  2022      2     B          2
 6  2022      3     A          2
 7  2022      3     B          2
 8  2022      4     A          2
 9  2022      4     B          2
10  2022      5     A          2
11  2022      5     B          1
12  2022      6     A          1
13  2022      6     B          1
14  2022      7     A          0
15  2022      7     B          1

I'm looking for a completely vectorized solution to reduce computational time.


Solution

  • Updated answer that avoids melt and may be faster. It uses the same idea as before: we don't actually care about member ids, we just keep track of start/end counts.

    #Create a MultiIndex of all (type, month) pairs for reindexing later
    months = pd.date_range(
        start=df.start_date.min(),
        end=df.end_date.max(),
        freq='MS',
    ).to_period('M')
    
    ind = pd.MultiIndex.from_product([df.type.unique(), months], names=['type', 'month'])
    
    #push each end date forward one day, onto the first day of the next month
    df['end_date'] += pd.DateOffset(days=1)
    
    #Convert the dates to yyyy-mm
    df['start_date'] = df.start_date.dt.to_period('M')
    df['end_date'] = df.end_date.dt.to_period('M')
    
    #Count how many memberships start and end in each month per type,
    #then take running totals: cumulative starts minus cumulative ends
    #gives the number of active memberships in each month
    gb_counts = (
        df.groupby('type').agg(
            start = ('start_date','value_counts'),
            end = ('end_date','value_counts'),
        )
        .reindex(ind)
        .fillna(0)
        .groupby('type')
        .cumsum()
        .astype(int)
    )
    
    counts = (gb_counts.start - gb_counts.end).unstack()
    counts
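
    To get back to the long year/month/type layout from the question, the wide counts frame can be reshaped along these lines (a sketch; long_counts is just an illustrative name, and member_id is reused only to match the question's output even though it now holds counts rather than ids):

    long_counts = (
        counts
        .stack()                 #back to a (type, month) -> count Series
        .rename('member_id')
        .reset_index()
    )
    long_counts['year'] = long_counts['month'].dt.year
    long_counts['month'] = long_counts['month'].dt.month
    long_counts = (
        long_counts[['year', 'month', 'type', 'member_id']]
        .sort_values(['year', 'month', 'type'])
        .reset_index(drop=True)
    )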
    

    ORIGINAL

    Updated answer that works unless the same member_id/group has overlapping date ranges (in which case it double-counts).

    The idea is to keep track of when the number of users changes per group instead of exploding out all months per user.

    I think this should be very fast and I'm curious how it performs.

    Output

    (screenshot of the resulting counts table)

    Code (looks long but is mostly comments)

    import pandas as pd
    
    #Load example data
    import io #just for reading in your example table
    df = pd.read_csv(
        io.StringIO("""
    0  member_id  type  start_date    end_date
    1         10     A  2021-12-01  2022-05-31
    2         22     B  2022-01-01  2022-07-31
    3         17     A  2022-01-01  2022-06-30
    4         57     A  2022-02-02  2022-02-28
    5         41     B  2022-02-02  2022-04-30
    """),
        delim_whitespace=True,
        index_col=0,
        parse_dates=['start_date','end_date'],
    ).reset_index(drop=True)
    
    #Create a full (type, month) MultiIndex for reindexing and ffill
    months = pd.date_range(
        start=df.start_date.min(),
        end=df.end_date.max(),
        freq='MS',
    ).to_period('M')
    
    full_ind = pd.MultiIndex.from_product([df.type.unique(), months], names=['type', 'date'])
    
    #push each end date forward one day, onto the first day of the next month
    df['end_date'] += pd.DateOffset(days=1)
    
    #Convert the dates to yyyy-mm
    df['start_date'] = df.start_date.dt.to_period('M')
    df['end_date'] = df.end_date.dt.to_period('M')
    
    #Melt from:
    #
    #member_id | type | start_date |  end_date
    #----------|------|------------|-----------
    #       10 |   A  | 2021-12-01 | 2022-05-31
    # ...
    #
    #to
    #
    # type | active_users | date
    #----------------------------
    #    A |   start_date | 2021-12-01
    #    A |     end_date | 2022-05-31
    # ...
    df = df.melt(
        id_vars='type',
        value_vars=['start_date','end_date'],
        var_name='active_users',
        value_name='date',
    ).sort_values('date')
    
    #Replace var column with +1/-1 for start/end date rows
    #
    # type | active_users | date
    #----------------------------
    #    A |            1 | 2021-12-01
    #    A |           -1 | 2022-05-31
    # ...
    df['active_users'] = df.active_users.replace({'start_date':1,'end_date':-1})
    
    #Sum the +/-1s within each type/date, then take a running total to get
    #the number of active users. A single global cumsum is safe because the
    #groupby result is sorted by (type, date) and each type's changes sum to
    #zero, so nothing carries over from one type block into the next
    df = df.groupby(['type','date']).sum().cumsum()
    
    #Reindex to the full (type, month) grid and ffill months with no changes.
    #ffill across the type boundary is harmless here because each type's
    #count has already dropped back to 0 by its last change
    df = df.reindex(full_ind).ffill().astype(int)
    
    df.unstack()
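
    A rough way to get a feel for speed closer to the real scale (the question mentions N ~ 30M) is to tile the toy rows and time the core melt/cumsum steps. This is only a sketch: raw, big and n_copies are made-up names, duplicated rows are just a crude stand-in for millions of distinct members, and the reindex/ffill step is left out of the timing.

    import time
    import pandas as pd

    #Rebuild the raw example frame so the timing run starts from unmodified data
    raw = pd.DataFrame({
        'member_id': [10, 22, 17, 57, 41],
        'type': ['A', 'B', 'A', 'A', 'B'],
        'start_date': pd.to_datetime(['2021-12-01', '2022-01-01', '2022-01-01',
                                      '2022-02-02', '2022-02-02']),
        'end_date': pd.to_datetime(['2022-05-31', '2022-07-31', '2022-06-30',
                                    '2022-02-28', '2022-04-30']),
    })

    n_copies = 200_000                 #~1M rows; raise toward 30M as memory allows
    big = pd.concat([raw] * n_copies, ignore_index=True)

    t0 = time.perf_counter()
    #same push-to-next-month trick as above, but with a vectorized Timedelta
    big['end_date'] += pd.Timedelta(days=1)
    big['start_date'] = big.start_date.dt.to_period('M')
    big['end_date'] = big.end_date.dt.to_period('M')
    big = big.melt(
        id_vars='type',
        value_vars=['start_date', 'end_date'],
        var_name='active_users',
        value_name='date',
    ).sort_values('date')
    big['active_users'] = big.active_users.replace({'start_date': 1, 'end_date': -1})
    big = big.groupby(['type', 'date']).sum().cumsum()
    print(f'{time.perf_counter() - t0:.2f}s for {len(raw) * n_copies:,} input rows')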