Tags: python, dataframe, time-series, interpolation, python-polars

Interpolate time series data from one df onto the time axis of another df in Python Polars


I have time series data on different time axes in different dataframes. I need to interpolate data from one df onto the time axis of another df, df_ref. Example:

import polars as pl

# DataFrame with the reference time axis:
df_ref = pl.DataFrame({"dt": ["2022-12-14T14:00:01.000", "2022-12-14T14:00:02.000",
                              "2022-12-14T14:00:03.000", "2022-12-14T14:00:04.000",
                              "2022-12-14T14:00:05.000", "2022-12-14T14:00:06.000"]})
df_ref = df_ref.with_columns(pl.col("dt").str.strptime(pl.Datetime).cast(pl.Datetime))

# DataFrame with a different frequency time axis, to be interpolated onto the reference time axis:
df = pl.DataFrame({
        "dt": ["2022-12-14T14:00:01.500", "2022-12-14T14:00:03.500", "2022-12-14T14:00:05.500"],
        "v1": [1.5, 3.5, 5.5]})
df = df.with_columns(pl.col("dt").str.strptime(pl.Datetime).cast(pl.Datetime))

I cannot join the dfs since keys don't match:

print(df_ref.join(df, on="dt", how="left").interpolate())
shape: (6, 2)
┌─────────────────────┬──────┐
│ dt                  ┆ v1   │
│ ---                 ┆ ---  │
│ datetime[μs]        ┆ f64  │
╞═════════════════════╪══════╡
│ 2022-12-14 14:00:01 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:02 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:03 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:04 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:05 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:06 ┆ null │
└─────────────────────┴──────┘

So my 'iterative' approach would be to interpolate each column individually, for instance like this:

from scipy.interpolate import interp1d

f = interp1d(df["dt"].dt.timestamp(), df["v1"],
             kind="linear", bounds_error=False, fill_value="extrapolate")

out = f(df_ref["dt"].dt.timestamp())
df_ref = df_ref.with_columns(pl.Series(out).alias("v1_interp"))

print(df_ref.head(6))
shape: (6, 2)
┌─────────────────────┬───────────┐
│ dt                  ┆ v1_interp │
│ ---                 ┆ ---       │
│ datetime[μs]        ┆ f64       │
╞═════════════════════╪═══════════╡
│ 2022-12-14 14:00:01 ┆ 1.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:02 ┆ 2.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:03 ┆ 3.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:04 ┆ 4.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:05 ┆ 5.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:06 ┆ 6.0       │
└─────────────────────┴───────────┘

Although this gives the result I need, I wonder if there is a more idiomatic approach? I hesitate to mention efficiency here since I haven't benchmarked this with real data yet ("measure, don't guess!"). However, I'd assume that a native implementation in the underlying Rust code could add some performance benefits.


Solution

  • The scipy.interpolate.interp1d example ultimately boils down to a plain searchsorted-plus-slope linear interpolation.

    You could use the same approach and process each column with .map():

    import numpy as np

    def polars_ip(df_ref, df):
        # Timestamps of the readings ("old") and of the reference axis ("new").
        old = df["dt"].dt.timestamp().to_numpy()
        new = df_ref["dt"].dt.timestamp().to_numpy()

        # Indices of the bracketing readings for every reference timestamp; the
        # clip means points outside the data range are linearly extrapolated
        # from the first/last segment, like fill_value="extrapolate" in scipy.
        hi = np.searchsorted(old, new).clip(1, len(old) - 1)
        lo = hi - 1

        def _interp(column):
            # df comes first in the diagonal concat below, so positions
            # 0..len(df)-1 of `column` line up with `old`.
            column = column.to_numpy()
            slope = (column[hi] - column[lo]) / (old[hi] - old[lo])
            return pl.Series(slope * (new - old[lo]) + column[lo])

        values = (
            pl.concat([df, df_ref], how="diagonal")
              .select(pl.exclude("dt").map(_interp))
        )
        values.columns = [f"{name}_ref_ip" for name in values.columns]

        return df_ref.hstack(values)
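
    The timings below compare against a scipy_ip helper. A minimal sketch of such a baseline, assuming it simply wraps the question's interp1d approach over every non-"dt" column and writes the same *_ref_ip column names:

    from scipy.interpolate import interp1d

    def scipy_ip(df_ref, df):
        # Hypothetical reconstruction of the scipy baseline used in the
        # timings: interpolate every value column of df onto df_ref's axis.
        old = df["dt"].dt.timestamp().to_numpy()
        new = df_ref["dt"].dt.timestamp().to_numpy()
        out = df_ref
        for name in df.columns:
            if name == "dt":
                continue
            f = interp1d(old, df[name], kind="linear",
                         bounds_error=False, fill_value="extrapolate")
            out = out.with_columns(pl.Series(f(new)).alias(f"{name}_ref_ip"))
        return out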
    
    >>> %time polars_ip(df_ref, df)
    CPU times: user 48.1 ms, sys: 20.4 ms, total: 68.5 ms
    Wall time: 22 ms
    shape: (85536, 11)
    
    >>> %time scipy_ip(df_ref, df)
    CPU times: user 75.5 ms, sys: 5.51 ms, total: 81 ms
    Wall time: 74.3 ms
    shape: (85536, 11)
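
    The timings were taken on larger frames: per the printed shape, 85,536 reference rows and 10 value columns named "0" through "9" (as used further below). A hypothetical setup of comparable size, purely for illustration:

    import numpy as np
    from datetime import datetime, timedelta

    N_COLS = 10                 # value columns named "0" .. "9"
    N_REF = 85_536              # matches the printed shape (85536, 11)
    start = datetime(2022, 12, 14, 14, 0, 1)

    # Reference time axis: one row per second.
    df_ref = pl.DataFrame(
        {"dt": [start + timedelta(seconds=i) for i in range(N_REF)]})

    # Lower-frequency readings, offset by half a second.
    df = pl.DataFrame(
        {"dt": [start + timedelta(seconds=2 * i, milliseconds=500)
                for i in range(N_REF // 2)],
         **{str(c): np.random.random(N_REF // 2) for c in range(N_COLS)}})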
    

    Check they return the same values:

    >>> polars_ip(df_ref, df).frame_equal(scipy_ip(df_ref, df))
    True
    

    You can also generate the same values using:

    # Value columns of the larger test frames are named "0" .. "9".
    N_COLS = 10
    names = list(map(str, range(N_COLS)))
    has_reading = pl.col(names[0]).is_not_null()
    has_no_reading = has_reading.is_not()
    (
       # Put readings and reference timestamps on one sorted time axis;
       # the reference rows have null value columns.
       pl.concat([df, df_ref], how="diagonal")
       .sort("dt")
       # For every row, pick up the next reading ("_hi") and the
       # previous reading ("_lo").
       .with_columns([
          pl.when(has_reading).then(pl.all())
            .shift(-1).backward_fill().suffix("_hi"),
          pl.when(has_reading).then(pl.all())
            .shift(+1).forward_fill().suffix("_lo")
          ])
       # Re-anchor "_hi"/"_lo" at the reading rows and fill outwards so each
       # reference timestamp is bracketed by the readings on either side;
       # timestamps outside the data range get the first/last two readings,
       # i.e. linear extrapolation as in the scipy version.
       .with_columns([
          pl.when(has_reading).then(pl.col(r"^.+_hi$"))
            .forward_fill().backward_fill(),
          pl.when(has_reading).then(pl.col(r"^.+_lo$"))
            .backward_fill().forward_fill()
          ])
       # Keep only the reference timestamps.
       .filter(has_no_reading)
       # Integer timestamps for dt, dt_hi, dt_lo -> dt_ts, dt_hi_ts, dt_lo_ts.
       .with_column(
          pl.col(r"^dt.*$").dt.timestamp().suffix("_ts"))
       # Standard linear interpolation per column: lo + slope * (t - t_lo).
       .with_columns([
          (((pl.col(f"{name}_hi")  - pl.col(f"{name}_lo"))
             / (pl.col("dt_hi_ts") - pl.col("dt_lo_ts")))
             * (pl.col("dt_ts")    - pl.col("dt_lo_ts"))
             + pl.col(f"{name}_lo"))
             .alias(f"{name}_ref_ip") for name in names
          ])
       .select([
          pl.col("dt"), pl.col("^.+_ref_ip$")
       ])
    )
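
    The column names above are those of the larger test frames; for the question's two small frames the only change needed is the list of value columns, for example:

    # Adaptation for the question's frames: a single value column "v1".
    names = ["v1"]
    has_reading = pl.col(names[0]).is_not_null()
    has_no_reading = has_reading.is_not()
    # ...the rest of the chain is unchanged and should yield one row per
    # reference timestamp with a "v1_ref_ip" column matching the v1_interp
    # values from the scipy example in the question.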