I have a dataframe containing sensor measurements and labels structured as time series data that looks a bit like this:
df = pd.DataFrame( {'Sensor_ID': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'sensornode0009', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'sensornode0015', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'sensornode0011', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'sensornode0012', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'sensornode0016'}, 'CO2_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 1150.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 1300.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 412.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 1100.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 1392.0}, 'CO_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): -0.43, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): -0.08, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): -0.08, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): -0.16, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.18}, 'H2_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.13, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.12, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.22, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.13, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.09}, 'Humidity_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 51.5, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 52.85, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 50.6, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 52.66, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 52.53}, 'PM05_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 15.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 7.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 16.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 13.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 10.0}, 'PM100_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM10_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 2.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 2.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 1.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 1.0}, 'PM25_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM40_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM_Room_Typical_Size': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.4, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.45, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.47, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.42, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.44}, 'PM_Total_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 17.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 8.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 19.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 15.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 11.0}, 'Temperature_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 26.3, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 25.7, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 26.5, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 25.9, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 26.0}, 'UV_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'VOC_Room_RAW': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.6, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.5}, 'scenario_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Background'}, 'anomaly_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Normal'}, 'ternary_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Background'}} )
I want to select a subsequence of the time series dataframe (defined by an start and end DatetimeIndex), cut it from the original position and insert the subsequence at the new position in the time series. The "gap" from the original position of the subsequence should be set to NaN so that these values can be imputed in an additional step e.g.
So far I´ve tryed to use the following function for this problem:
def move_subsequence(df, start, end, new_start):
"""
Move a subsequence of a time series to a new position.
Parameters:
df (pandas.DataFrame): The time series to modify.
start (pandas.Timestamp): The start time of the subsequence to move.
end (pandas.Timestamp): The end time of the subsequence to move.
new_start (pandas.Timestamp): The new start time for the subsequence.
Returns:
pandas.DataFrame: The modified time series.
"""
# Extract the subsequence to move
subsequence = df.loc[start:end]
# Remove the subsequence from its original position
df = df.drop(subsequence.index)
# Calculate the length of the subsequence
subseq_length = len(subsequence)
# Insert the subsequence at the new position
if new_start > start:
df = pd.concat([df.loc[:new_start - timedelta(subseq_length)], subsequence,
df.loc[new_start:]])
else:
df = pd.concat([df.loc[:new_start], subsequence, df.loc[new_start + timedelta(subseq_length):]])
return df
which cuts of the subsequence correctly but does not insert it to the new_start position in the time series.
With a shortened version of your dataframe:
import pandas as pd
df = pd.DataFrame(
{
"Sensor_ID": {
pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): "sensornode0009",
pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): "sensornode0015",
pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): "sensornode0011",
pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): "sensornode0012",
pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): "sensornode0013",
pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): "sensornode0016",
},
"CO2_Room": {
pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): 1150.0,
pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): 1300.0,
pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): 412.0,
pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): 1100.0,
pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): 900.0,
pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 1392.0,
},
"CO_Room": {
pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): -0.43,
pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): -0.08,
pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): -0.08,
pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): -0.16,
pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): -0.80,
pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 0.18,
},
"H2_Room": {
pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): 0.13,
pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): 0.12,
pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): 0.22,
pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): 0.13,
pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): 0.19,
pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 0.09,
},
}
)
print(df)
Here is one way to do it with a new version of your function:
def move_subsequence(
df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp, new_start: pd.Timestamp
) -> pd.DataFrame:
"""Move a subsequence of a time series to a new position.
Args:
df: The time series to modify.
start: The start time of the subsequence to move.
end: The end time of the subsequence to move.
new_start: The new start time for the subsequence.
Returns:
The modified time series.
"""
# Check that start is valid else return df unchanged
if start < df.index[0] or start > df.index[-1] or start > end:
return df
# Slice df and remove extracted rows
sub_df = df.loc[start:end, :]
df = df[~df.index.isin(sub_df.index)]
# Evaluate time delta between eeach values of sub_df index
time_deltas = [
sub_df.index[i + 1] - sub_df.index[i]
for i, _ in enumerate(sub_df.index)
if i < sub_df.shape[0] - 1
]
# Set new index of sub_df
new_index = [new_start]
for time_delta in time_deltas:
new_index = new_index + [new_index[-1] + time_delta]
sub_df.index = new_index
# Concat and return df and sub_df
return pd.concat([df, sub_df]).sort_index()
Then, for instance:
start = pd.Timestamp("2022-07-04 14:10:53.401380+00:00", tz="UTC")
end = pd.Timestamp("2022-07-04 14:10:54.899093+00:00 ", tz="UTC")
new_start = pd.Timestamp("2022-07-04 14:10:56.101032+0000", tz="UTC")
new_df = move_subsequence(df, start, end, new_start)
print(new_df)