Search code examples
pythonpandas

Create columns from grouped output


I have data which looks like this

startDate   endDate value   sourceName
0   2024-06-03 22:26:00+02:00   2024-06-03 22:46:00+02:00   HKCategoryValueSleepAnalysisAsleepCore  AppleWatch
6   2024-06-03 22:40:00+02:00   2024-06-04 07:48:00+02:00   HKCategoryValueSleepAnalysisAsleepCore  Connect
1   2024-06-03 22:46:00+02:00   2024-06-03 22:49:00+02:00   HKCategoryValueSleepAnalysisAwake   AppleWatch
2   2024-06-03 22:49:00+02:00   2024-06-04 00:56:00+02:00   HKCategoryValueSleepAnalysisAsleepREM   AppleWatch
3   2024-06-04 00:56:00+02:00   2024-06-04 03:56:00+02:00   HKCategoryValueSleepAnalysisAsleepCore  AppleWatch
4   2024-06-04 05:56:00+02:00   2024-06-04 07:56:00+02:00   HKCategoryValueSleepAnalysisAsleepREM   AppleWatch
5   2024-06-04 22:40:00+02:00   2024-06-05 07:48:00+02:00   HKCategoryValueSleepAnalysisAsleepCore  AppleWatch

I group them into "sleep sessions" by device and start end date if the gap is no larger than 2 hours like so.

                startDate                   endDate                     duration
sourceName              
AppleWatch  0   2024-06-03 22:26:00+02:00   2024-06-04 07:56:00+02:00   7.500000
            1   2024-06-04 22:40:00+02:00   2024-06-05 07:48:00+02:00   9.133333
Connect     1   2024-06-03 22:40:00+02:00   2024-06-04 07:48:00+02:00   9.133333

I would like to get columns which sum the duration of each grouped value (within the session). example

REM_duration
Core_duration
Awake_duration

Also any gaps between stages (see between row index 3&4) should be added to Awake_duration. Example Awake_duration from session 0 should be 2.05

So expected output would be

    startDate                   endDate                    duration sourceName  rem_duration    core_duration   awake_duration
0   2024-06-03 22:26:00+02:00   2024-06-04 07:56:00+02:00   7.500000    AppleWatch  4.116667    3.333333    2.05
1   2024-06-04 22:40:00+02:00   2024-06-05 07:48:00+02:00   9.133333    AppleWatch  0.000000    9.133333    0.00
1   2024-06-03 22:40:00+02:00   2024-06-04 07:48:00+02:00   8.133333    Connect 1.000000    7.133333    1.00

This is what I have so far

import pandas as pd
from datetime import timedelta

data = [
    {
        "startDate": pd.Timestamp("2024-06-03 22:26:00+0200"),
        "endDate": pd.Timestamp("2024-06-03 22:46:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepCore",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-03 22:46:00+0200"),
        "endDate": pd.Timestamp("2024-06-03 22:49:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAwake",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-03 22:49:00+0200"),
        "endDate": pd.Timestamp("2024-06-04 00:56:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepREM",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-04 00:56:00+0200"),
        "endDate": pd.Timestamp("2024-06-04 03:56:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepCore",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-04 05:56:00+0200"),
        "endDate": pd.Timestamp("2024-06-04 07:56:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepREM",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-04 22:40:00+0200"),
        "endDate": pd.Timestamp("2024-06-05 07:48:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepCore",
        "sourceName": "AppleWatch"
    },
    {
        "startDate": pd.Timestamp("2024-06-03 22:40:00+0200"),
        "endDate": pd.Timestamp("2024-06-04 07:48:00+0200"),
        "value": "HKCategoryValueSleepAnalysisAsleepCore",
        "sourceName": "Connect"
    }
]

# Create DataFrame
df_orig = pd.DataFrame.from_records(data).sort_values('startDate')
max_gap = 2

df = df_orig.copy()

df = df.sort_values(['sourceName', 'startDate'])

df['duration'] = (df['endDate'] - df['startDate']).div(pd.Timedelta(hours=1))

g = df['startDate'].sub(df['endDate'].shift()).div(pd.Timedelta(hours=1))
df2 = df.groupby(['sourceName', g.gt(max_gap).cumsum()]).agg({'startDate':'min', 'endDate':'max', 'duration': 'sum'})

Solution

  • IIUC, you can add an extra grouper, an extra column with groupby.diff, the perform your custom computations, unstack and concat:

    max_gap = 2
    
    tmp = (df_orig
       .sort_values(['sourceName', 'startDate'])
       .assign(duration=lambda x: x['endDate'].sub(x['startDate']).div(pd.Timedelta(hours=1)),
               delta=lambda x: x['startDate'].sub(x.groupby('sourceName')['endDate'].shift())
                                             .div(pd.Timedelta(hours=1)),
               n=lambda x: x['delta'].gt(max_gap).cumsum(),
               col=lambda x: x['value'].str.extract(r'([A-Z]+[a-z]*)$', expand=False).str.lower()
               
              )
       .groupby(['sourceName', 'n', 'col'])
       .agg({'startDate':'min', 'endDate':'max',
             'duration': 'sum', 'delta': 'sum'})
    )
    
    g = tmp.groupby(level=['sourceName', 'n'])
    
    out = (pd.concat([g[['startDate']].min(), g[['endDate']].max(),
                      g[['duration']].sum(),
                      (tmp['duration'].mask((tmp.index.get_level_values('col')=='awake'),
                                            tmp['duration']+g['delta'].transform('sum')
                                           )
                       .unstack('col', fill_value=0)
                       .add_suffix('_duration')
                      )], axis=1)
             .reset_index(level=0).rename_axis(None)
          )
    

    Output:

       sourceName                 startDate                   endDate  duration  awake_duration  core_duration  rem_duration
    0  AppleWatch 2024-06-03 22:26:00+02:00 2024-06-04 07:56:00+02:00  7.500000            2.05       3.333333      4.116667
    1  AppleWatch 2024-06-04 22:40:00+02:00 2024-06-05 07:48:00+02:00  9.133333            0.00       9.133333      0.000000
    1     Connect 2024-06-03 22:40:00+02:00 2024-06-04 07:48:00+02:00  9.133333            0.00       9.133333      0.000000