We have two tables with time intervals. I want to fill gaps in df1
with df2
as in the graph to get df3
. df1
is moved to df3
as it is, and only the parts of df2
that lie in the gaps of df1
(difference) are moved to df3
.
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03', '2023-05-20', '2023-05-30']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})
# df1
Start End
0 2023-01-01 2023-01-15
1 2023-02-01 2023-02-20
2 2023-03-15 2023-04-01
3 2023-04-18 2023-05-03
4 2023-05-15 2023-05-20
5 2023-05-25 2023-05-30
# df2
Start End
0 2023-01-02 2023-01-03
1 2023-01-05 2023-01-10
2 2023-01-20 2023-02-10
3 2023-02-25 2023-03-01
4 2023-03-05 2023-04-15
5 2023-04-18 2023-05-10
6 2023-05-12 2023-06-05
# df3 (desired result)
Start End
0 2023-01-01 2023-01-15
1 2023-01-20 2023-01-31
2 2023-02-01 2023-02-20
3 2023-02-25 2023-03-01
4 2023-03-05 2023-03-14
5 2023-03-15 2023-04-01
6 2023-04-02 2023-04-15
7 2023-04-18 2023-05-03
8 2023-05-04 2023-05-10
9 2023-05-12 2023-05-14
10 2023-05-15 2023-05-20
11 2023-05-21 2023-05-24
12 2023-05-25 2023-05-30
13 2023-05-31 2023-06-05
Code to generate plot:
import plotly.express as px
df_plot = pd.concat(
[
df1.assign(color='df1', df='df1'),
df2.assign(color='df2', df='df2'),
df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
I think I can get you close:
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
df1['dates'] = [pd.date_range(s,e) for s, e in zip(df1['Start'], df1['End'])]
df2['dates'] = [pd.date_range(s,e) for s, e in zip(df2['Start'], df2['End'])]
df1e = df1.explode('dates').assign(source='df1')
df2e = df2.explode('dates').assign(source='df2')
df3e = df1e.set_index(df1e['dates']).combine_first(df2e.set_index(df2e['dates']))
df3e['dates'] = pd.to_datetime(df3e['dates'])
df3e['group'] = ((df3e['source'] != df3e['source'].shift()) |
(df3e['dates'] - df3e['dates'].shift() > pd.Timedelta(days=1))).cumsum()
df_out = df3e.groupby(['group', 'source'])['dates'].agg([min, max])
Output:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
Graphical Output:
import plotly.express as px
df_out = df_out.reset_index().rename({'source':'color', 'min':'Start', 'max':'End'}, axis=1)
df_plot = pd.concat(
[
df1.assign(color='df1'),
df2.assign(color='df2'),
df_out
],
keys=['df1' , 'df2', 'df3']
).reset_index(level=0, names='df')
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
Graph:
with updated dataset:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
8 df1 2023-04-18 2023-05-03
9 df2 2023-05-04 2023-05-10
10 df2 2023-05-12 2023-05-14
11 df1 2023-05-15 2023-05-20
12 df2 2023-05-21 2023-05-24
13 df1 2023-05-25 2023-05-30
14 df2 2023-05-31 2023-06-05
Graph output: