Search code examples
pythonpandasdata-manipulation

Moving a subsequence of a time series from original position to a different position


I have a dataframe containing sensor measurements and labels structured as time series data that looks a bit like this:

df = pd.DataFrame( {'Sensor_ID': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'sensornode0009', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'sensornode0015', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'sensornode0011', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'sensornode0012', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'sensornode0016'}, 'CO2_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 1150.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 1300.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 412.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 1100.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 1392.0}, 'CO_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): -0.43, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): -0.08, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): -0.08, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): -0.16, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.18}, 'H2_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.13, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.12, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.22, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.13, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.09}, 'Humidity_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 51.5, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 52.85, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 50.6, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 52.66, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 52.53}, 'PM05_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 15.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 7.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 16.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 13.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 10.0}, 'PM100_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM10_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 2.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 2.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 1.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 1.0}, 'PM25_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM40_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'PM_Room_Typical_Size': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.4, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.45, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.47, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.42, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.44}, 'PM_Total_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 17.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 8.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 19.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 15.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 11.0}, 'Temperature_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 26.3, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 25.7, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 26.5, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 25.9, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 26.0}, 'UV_Room': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.0, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.0}, 'VOC_Room_RAW': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 0.5, Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 0.6, Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 0.5}, 'scenario_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Background'}, 'anomaly_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Normal', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Normal'}, 'ternary_label': {Timestamp('2022-07-04 14:10:54.259355+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:54.401380+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.401182+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:55.899093+0000', tz='UTC'): 'Background', Timestamp('2022-07-04 14:10:59.346954+0000', tz='UTC'): 'Background'}} )

I want to select a subsequence of the time series dataframe (defined by an start and end DatetimeIndex), cut it from the original position and insert the subsequence at the new position in the time series. The "gap" from the original position of the subsequence should be set to NaN so that these values can be imputed in an additional step e.g.

So far I´ve tryed to use the following function for this problem:

def move_subsequence(df, start, end, new_start):
    """
    Move a subsequence of a time series to a new position.

    Parameters:
    df (pandas.DataFrame): The time series to modify.
    start (pandas.Timestamp): The start time of the subsequence to move.
    end (pandas.Timestamp): The end time of the subsequence to move.
    new_start (pandas.Timestamp): The new start time for the subsequence.

    Returns:
    pandas.DataFrame: The modified time series.
    """
    # Extract the subsequence to move
    subsequence = df.loc[start:end]

    # Remove the subsequence from its original position
    df = df.drop(subsequence.index)

    # Calculate the length of the subsequence
    subseq_length = len(subsequence)

    # Insert the subsequence at the new position
    if new_start > start:
        df = pd.concat([df.loc[:new_start - timedelta(subseq_length)], subsequence,
df.loc[new_start:]])
    else:
        df = pd.concat([df.loc[:new_start], subsequence, df.loc[new_start + timedelta(subseq_length):]])

    return df

which cuts of the subsequence correctly but does not insert it to the new_start position in the time series.


Solution

  • With a shortened version of your dataframe:

    import pandas as pd
    
    df = pd.DataFrame(
        {
            "Sensor_ID": {
                pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): "sensornode0009",
                pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): "sensornode0015",
                pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): "sensornode0011",
                pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): "sensornode0012",
                pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): "sensornode0013",
                pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): "sensornode0016",
            },
            "CO2_Room": {
                pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): 1150.0,
                pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): 1300.0,
                pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): 412.0,
                pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): 1100.0,
                pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): 900.0,
                pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 1392.0,
            },
            "CO_Room": {
                pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): -0.43,
                pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): -0.08,
                pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): -0.08,
                pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): -0.16,
                pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): -0.80,
                pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 0.18,
            },
            "H2_Room": {
                pd.Timestamp("2022-07-04 14:10:52.259355+0000", tz="UTC"): 0.13,
                pd.Timestamp("2022-07-04 14:10:53.401380+0000", tz="UTC"): 0.12,
                pd.Timestamp("2022-07-04 14:10:54.401182+0000", tz="UTC"): 0.22,
                pd.Timestamp("2022-07-04 14:10:54.899093+0000", tz="UTC"): 0.13,
                pd.Timestamp("2022-07-04 14:10:55.299093+0000", tz="UTC"): 0.19,
                pd.Timestamp("2022-07-04 14:10:59.346954+0000", tz="UTC"): 0.09,
            },
        }
    )
    
    print(df)
    

    enter image description here

    Here is one way to do it with a new version of your function:

    def move_subsequence(
        df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp, new_start: pd.Timestamp
    ) -> pd.DataFrame:
        """Move a subsequence of a time series to a new position.
    
        Args:
            df: The time series to modify.
            start: The start time of the subsequence to move.
            end: The end time of the subsequence to move.
            new_start: The new start time for the subsequence.
    
        Returns:
            The modified time series.
    
        """
        # Check that start is valid else return df unchanged
        if start < df.index[0] or start > df.index[-1] or start > end:
            return df
    
        # Slice df and remove extracted rows
        sub_df = df.loc[start:end, :]
        df = df[~df.index.isin(sub_df.index)]
    
        # Evaluate time delta between eeach values of sub_df index
        time_deltas = [
            sub_df.index[i + 1] - sub_df.index[i]
            for i, _ in enumerate(sub_df.index)
            if i < sub_df.shape[0] - 1
        ]
    
        # Set new index of sub_df
        new_index = [new_start]
        for time_delta in time_deltas:
            new_index = new_index + [new_index[-1] + time_delta]
        sub_df.index = new_index
    
        # Concat and return df and sub_df
        return pd.concat([df, sub_df]).sort_index()
    

    Then, for instance:

    start = pd.Timestamp("2022-07-04 14:10:53.401380+00:00", tz="UTC")
    end = pd.Timestamp("2022-07-04 14:10:54.899093+00:00    ", tz="UTC")
    new_start = pd.Timestamp("2022-07-04 14:10:56.101032+0000", tz="UTC")
    
    new_df = move_subsequence(df, start, end, new_start)
    
    print(new_df)
    

    enter image description here