Search code examples
pythonpandastime-series

Fill gaps in time intervals with other time intervals


We have two tables with time intervals. I want to fill gaps in df1 with df2 as in the graph to get df3. df1 is moved to df3 as it is, and only the parts of df2 that lie in the gaps of df1 (difference) are moved to df3.

[1]: https://i.sstatic.net/uQeML.png

df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
                    'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03',  '2023-05-20', '2023-05-30']})

df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
                    'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})

df3 =  pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
                     'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})

# df1

        Start         End
0  2023-01-01  2023-01-15
1  2023-02-01  2023-02-20
2  2023-03-15  2023-04-01
3  2023-04-18  2023-05-03
4  2023-05-15  2023-05-20
5  2023-05-25  2023-05-30

# df2

        Start         End
0  2023-01-02  2023-01-03
1  2023-01-05  2023-01-10
2  2023-01-20  2023-02-10
3  2023-02-25  2023-03-01
4  2023-03-05  2023-04-15
5  2023-04-18  2023-05-10
6  2023-05-12  2023-06-05

# df3 (desired result)

         Start         End
0   2023-01-01  2023-01-15
1   2023-01-20  2023-01-31
2   2023-02-01  2023-02-20
3   2023-02-25  2023-03-01
4   2023-03-05  2023-03-14
5   2023-03-15  2023-04-01
6   2023-04-02  2023-04-15
7   2023-04-18  2023-05-03
8   2023-05-04  2023-05-10
9   2023-05-12  2023-05-14
10  2023-05-15  2023-05-20
11  2023-05-21  2023-05-24
12  2023-05-25  2023-05-30
13  2023-05-31  2023-06-05

Code to generate plot:

import plotly.express as px

df_plot = pd.concat(
    [
        df1.assign(color='df1', df='df1'),
        df2.assign(color='df2', df='df2'), 
        df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
    ],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()

Solution

  • I think I can get you close:

    df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15'],
                        'End': ['2023-01-15', '2023-02-20', '2023-04-01']})
    df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
                        'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
    df3 =  pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
                         'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
    
    
    df1['dates'] = [pd.date_range(s,e) for s, e in zip(df1['Start'], df1['End'])]    
    df2['dates'] = [pd.date_range(s,e) for s, e in zip(df2['Start'], df2['End'])]
    
    df1e = df1.explode('dates').assign(source='df1')
    df2e = df2.explode('dates').assign(source='df2')
    
    
    df3e = df1e.set_index(df1e['dates']).combine_first(df2e.set_index(df2e['dates']))
    df3e['dates'] = pd.to_datetime(df3e['dates'])
    
    df3e['group'] = ((df3e['source'] != df3e['source'].shift()) | 
                     (df3e['dates'] - df3e['dates'].shift() > pd.Timedelta(days=1))).cumsum()
    
    df_out = df3e.groupby(['group', 'source'])['dates'].agg([min, max])
    

    Output:

                        min        max
    group source                      
    1     df1    2023-01-01 2023-01-15
    2     df2    2023-01-20 2023-01-31
    3     df1    2023-02-01 2023-02-20
    4     df2    2023-02-25 2023-03-01
    5     df2    2023-03-05 2023-03-14
    6     df1    2023-03-15 2023-04-01
    7     df2    2023-04-02 2023-04-15
    

    Graphical Output:

    import plotly.express as px
    
    df_out = df_out.reset_index().rename({'source':'color', 'min':'Start', 'max':'End'}, axis=1)
    df_plot = pd.concat(
        [
            df1.assign(color='df1'),
            df2.assign(color='df2'), 
            df_out
        ],
        keys=['df1' , 'df2', 'df3']
    ).reset_index(level=0, names='df')
    fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
    fig.update_yaxes(categoryorder='category descending')
    fig.show()
    

    Graph:

    enter image description here

    with updated dataset:

                        min        max
    group source                      
    1     df1    2023-01-01 2023-01-15
    2     df2    2023-01-20 2023-01-31
    3     df1    2023-02-01 2023-02-20
    4     df2    2023-02-25 2023-03-01
    5     df2    2023-03-05 2023-03-14
    6     df1    2023-03-15 2023-04-01
    7     df2    2023-04-02 2023-04-15
    8     df1    2023-04-18 2023-05-03
    9     df2    2023-05-04 2023-05-10
    10    df2    2023-05-12 2023-05-14
    11    df1    2023-05-15 2023-05-20
    12    df2    2023-05-21 2023-05-24
    13    df1    2023-05-25 2023-05-30
    14    df2    2023-05-31 2023-06-05
    

    Graph output:

    enter image description here