Tags: python, dataframe, time-series, interpolation, python-polars

Interpolate time series data from one df onto the time axis of another df in Python Polars


I have time series data on different time axes in different dataframes. I need to interpolate data from one df onto the time axis of another df, df_ref. Example:

import polars as pl

# DataFrame with the reference time axis:
df_ref = pl.DataFrame({"dt": ["2022-12-14T14:00:01.000", "2022-12-14T14:00:02.000",
                              "2022-12-14T14:00:03.000", "2022-12-14T14:00:04.000",
                              "2022-12-14T14:00:05.000", "2022-12-14T14:00:06.000"]})
df_ref = df_ref.with_columns(pl.col("dt").str.strptime(pl.Datetime).cast(pl.Datetime))

# DataFrame with a different frequency time axis, to be interpolated onto the reference time axis:
df = pl.DataFrame({
        "dt": ["2022-12-14T14:00:01.500", "2022-12-14T14:00:03.500", "2022-12-14T14:00:05.500"],
        "v1": [1.5, 3.5, 5.5]})
df = df.with_columns(pl.col("dt").str.strptime(pl.Datetime).cast(pl.Datetime))

I cannot join the dfs since keys don't match:

print(df_ref.join(df, on="dt", how="left").interpolate())
shape: (6, 2)
┌─────────────────────┬──────┐
│ dt                  ┆ v1   │
│ ---                 ┆ ---  │
│ datetime[μs]        ┆ f64  │
╞═════════════════════╪══════╡
│ 2022-12-14 14:00:01 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:02 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:03 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:04 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:05 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2022-12-14 14:00:06 ┆ null │
└─────────────────────┴──────┘

So my 'iterative' approach would be to interpolate each column individually, for instance like this:

from scipy.interpolate import interp1d

f = interp1d(df["dt"].dt.timestamp(), df["v1"],
             kind="linear", bounds_error=False, fill_value="extrapolate")

out = f(df_ref["dt"].dt.timestamp())
df_ref = df_ref.with_columns(pl.Series(out).alias("v1_interp"))

print(df_ref.head(6))
shape: (6, 2)
┌─────────────────────┬───────────┐
│ dt                  ┆ v1_interp │
│ ---                 ┆ ---       │
│ datetime[μs]        ┆ f64       │
╞═════════════════════╪═══════════╡
│ 2022-12-14 14:00:01 ┆ 1.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:02 ┆ 2.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:03 ┆ 3.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:04 ┆ 4.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:05 ┆ 5.0       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 14:00:06 ┆ 6.0       │
└─────────────────────┴───────────┘

Although this gives the result I need, I wonder if there is a more idiomatic approach? I hesitate to mention efficiency here since I haven't benchmarked this with real data yet ("measure, don't guess!"). However, I'd assume that a native implementation in the underlying Rust code could add some performance benefits.


Solution

  • The scipy.interpolate.interp1d example ultimately boils down to a plain searchsorted-plus-slope linear interpolation.

    You could use the same approach and process each column with .map():

    import numpy as np

    def polars_ip(df_ref, df):
        # Timestamps of the readings ("old") and of the reference axis ("new").
        old = df["dt"].dt.timestamp().to_numpy()
        new = df_ref["dt"].dt.timestamp().to_numpy()

        # Indices of the bracketing readings for every reference timestamp; the
        # clip means points outside the data range are linearly extrapolated
        # from the first/last segment, like fill_value="extrapolate" in scipy.
        hi = np.searchsorted(old, new).clip(1, len(old) - 1)
        lo = hi - 1

        def _interp(column):
            # df comes first in the diagonal concat below, so positions
            # 0..len(df)-1 of `column` line up with `old`.
            column = column.to_numpy()
            slope = (column[hi] - column[lo]) / (old[hi] - old[lo])
            return pl.Series(slope * (new - old[lo]) + column[lo])

        values = (
            pl.concat([df, df_ref], how="diagonal")
              .select(pl.exclude("dt").map(_interp))
        )
        values.columns = [f"{name}_ref_ip" for name in values.columns]

        return df_ref.hstack(values)
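
    The timings below compare against a scipy_ip helper. A minimal sketch of such a baseline, assuming it simply wraps the question's interp1d approach over every non-"dt" column and writes the same *_ref_ip column names:

    from scipy.interpolate import interp1d

    def scipy_ip(df_ref, df):
        # Hypothetical reconstruction of the scipy baseline used in the
        # timings: interpolate every value column of df onto df_ref's axis.
        old = df["dt"].dt.timestamp().to_numpy()
        new = df_ref["dt"].dt.timestamp().to_numpy()
        out = df_ref
        for name in df.columns:
            if name == "dt":
                continue
            f = interp1d(old, df[name], kind="linear",
                         bounds_error=False, fill_value="extrapolate")
            out = out.with_columns(pl.Series(f(new)).alias(f"{name}_ref_ip"))
        return out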
    
    >>> %time polars_ip(df_ref, df)
    CPU times: user 48.1 ms, sys: 20.4 ms, total: 68.5 ms
    Wall time: 22 ms
    shape: (85536, 11)
    
    >>> %time scipy_ip(df_ref, df)
    CPU times: user 75.5 ms, sys: 5.51 ms, total: 81 ms
    Wall time: 74.3 ms
    shape: (85536, 11)
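
    The timings were taken on larger frames: per the printed shape, 85,536 reference rows and 10 value columns named "0" through "9" (as used further below). A hypothetical setup of comparable size, purely for illustration:

    import numpy as np
    from datetime import datetime, timedelta

    N_COLS = 10                 # value columns named "0" .. "9"
    N_REF = 85_536              # matches the printed shape (85536, 11)
    start = datetime(2022, 12, 14, 14, 0, 1)

    # Reference time axis: one row per second.
    df_ref = pl.DataFrame(
        {"dt": [start + timedelta(seconds=i) for i in range(N_REF)]})

    # Lower-frequency readings, offset by half a second.
    df = pl.DataFrame(
        {"dt": [start + timedelta(seconds=2 * i, milliseconds=500)
                for i in range(N_REF // 2)],
         **{str(c): np.random.random(N_REF // 2) for c in range(N_COLS)}})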
    

    Check they return the same values:

    >>> polars_ip(df_ref, df).frame_equal(scipy_ip(df_ref, df))
    True
    

    You can also generate the same values using:

    # Value columns of the larger test frames are named "0" .. "9".
    N_COLS = 10
    names = list(map(str, range(N_COLS)))
    has_reading = pl.col(names[0]).is_not_null()
    has_no_reading = has_reading.is_not()
    (
       # Put readings and reference timestamps on one sorted time axis;
       # the reference rows have null value columns.
       pl.concat([df, df_ref], how="diagonal")
       .sort("dt")
       # For every row, pick up the next reading ("_hi") and the
       # previous reading ("_lo").
       .with_columns([
          pl.when(has_reading).then(pl.all())
            .shift(-1).backward_fill().suffix("_hi"),
          pl.when(has_reading).then(pl.all())
            .shift(+1).forward_fill().suffix("_lo")
          ])
       # Re-anchor "_hi"/"_lo" at the reading rows and fill outwards so each
       # reference timestamp is bracketed by the readings on either side;
       # timestamps outside the data range get the first/last two readings,
       # i.e. linear extrapolation as in the scipy version.
       .with_columns([
          pl.when(has_reading).then(pl.col(r"^.+_hi$"))
            .forward_fill().backward_fill(),
          pl.when(has_reading).then(pl.col(r"^.+_lo$"))
            .backward_fill().forward_fill()
          ])
       # Keep only the reference timestamps.
       .filter(has_no_reading)
       # Integer timestamps for dt, dt_hi, dt_lo -> dt_ts, dt_hi_ts, dt_lo_ts.
       .with_column(
          pl.col(r"^dt.*$").dt.timestamp().suffix("_ts"))
       # Standard linear interpolation per column: lo + slope * (t - t_lo).
       .with_columns([
          (((pl.col(f"{name}_hi")  - pl.col(f"{name}_lo"))
             / (pl.col("dt_hi_ts") - pl.col("dt_lo_ts")))
             * (pl.col("dt_ts")    - pl.col("dt_lo_ts"))
             + pl.col(f"{name}_lo"))
             .alias(f"{name}_ref_ip") for name in names
          ])
       .select([
          pl.col("dt"), pl.col("^.+_ref_ip$")
       ])
    )
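
    The column names above are those of the larger test frames; for the question's two small frames the only change needed is the list of value columns, for example:

    # Adaptation for the question's frames: a single value column "v1".
    names = ["v1"]
    has_reading = pl.col(names[0]).is_not_null()
    has_no_reading = has_reading.is_not()
    # ...the rest of the chain is unchanged and should yield one row per
    # reference timestamp with a "v1_ref_ip" column matching the v1_interp
    # values from the scipy example in the question.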