I'm working on a little utility app, coming from python/pandas and trying to rebuild some basic tools that can be distributed via executables. I'm having a hard time interpreting the documentation for what seems like it should be a fairly simple process of reading some raw data, resampling it based on the datetime column, and then interpolating it to fill in missing data as necessary.
My `Cargo.toml` looks like:
[dependencies]
polars = "0.19.0"
And the code I've written so far is:
use polars::prelude::*;
use std::fs::File;
// Intended flow: read `raw.csv`, interpolate and resample it (both steps are
// still unimplemented placeholders below), then write the result to `final.csv`.
fn main() {
// NOTE(review): judging by the compiler errors quoted after this snippet,
// `finish()` yields a `Result<DataFrame, PolarsError>` rather than a
// `DataFrame`, so `df` has to be unwrapped (`?` / `expect`) before any
// DataFrame method can be called on it.
let mut df = CsvReader::new("raw.csv".into())
.finish();
// TODO: interpolate to clean up blank/NaN values
// TODO: resample/group by 15Min-1D using the mean; leave blank/NaN if missing
// Panics with the message below if `final.csv` cannot be created.
let mut file = File::create("final.csv").expect("File not written!!!");
// Write with a header row and a comma delimiter. Whatever `finish` returns
// is discarded by the trailing `;` — presumably a `Result`, so a failed
// write would go unnoticed here (TODO confirm).
CsvWriter::new(&mut file)
.has_header(true)
.with_delimiter(b',')
.finish(&df);
}
and the raw.csv data might look like:
site,datetime,val1,val2,val3,val4,val5,val6
XX1,2021-01-01 00:45,,,,4.60,,
XX1,2021-01-01 00:50,,,,2.30,,
XX1,2021-01-01 00:53,21.90,16.00,77.67,3.45,1027.20,0.00
XX1,2021-01-01 01:20,,,,4.60,,
XX1,2021-01-01 01:53,21.90,16.00,77.67,3.45,1026.90,0.00
XX1,2021-01-01 01:55,,,,0.00,,
XX1,2021-01-01 02:00,,,,0.00,,
XX1,2021-01-01 02:45,,,,5.75,,
XX1,2021-01-01 02:50,,,,8.05,,
XX1,2021-01-01 02:53,21.00,16.00,80.69,8.05,1026.80,0.00
But I can't seem to call the methods because I get errors like:
method not found in `Result<DataFrame, PolarsError>`
or
expected struct `DataFrame`, found enum `Result`
and I'm not sure how to properly shift between classes.
I've tried obviously wrong answers like:
let grouped = df.lazy().groupby_dynamic("datetime", "1h").agg("datetime", mean());
but basically, I'm looking for the polars equivalent of pandas code:
df = df.interpolate()
df = df.resample(sample_frequency).mean()
Any help would be appreciated!
Here is an example of how you might do this:
use chrono::prelude::*;
use polars::prelude::*;
use polars_core::time::*;
use std::io::Cursor;
use polars::frame::groupby::DynamicGroupOptions;
// Upsamples the sample data onto a 1-minute grid, interpolates the gaps and
// then regroups the result into fixed 15-minute windows.
fn main() -> Result<()> {
    // Sample data kept inline so the example is self-contained.
    let raw_csv = "site,datetime,val1,val2,val3,val4,val5,val6
XX1,2021-01-01 00:45,,,,4.60,,
XX1,2021-01-01 00:50,,,,2.30,,
XX1,2021-01-01 00:53,21.90,16.00,77.67,3.45,1027.20,0.00
XX1,2021-01-01 01:20,,,,4.60,,
XX1,2021-01-01 01:53,21.90,16.00,77.67,3.45,1026.90,0.00
XX1,2021-01-01 01:55,,,,0.00,,
XX1,2021-01-01 02:00,,,,0.00,,
XX1,2021-01-01 02:45,,,,5.75,,
XX1,2021-01-01 02:50,,,,8.05,,
XX1,2021-01-01 02:53,21.00,16.00,80.69,8.05,1026.80,0.00
";

    // Prefer scanning the csv lazily when your data is not already in memory.
    let mut df = CsvReader::new(Cursor::new(raw_csv)).finish()?;

    // Parse the textual `datetime` column into a real datetime column in place.
    df.try_apply("datetime", |column| {
        let parsed = column
            .utf8()?
            .as_datetime(Some("%Y-%m-%d %H:%M"), TimeUnit::Nanoseconds)?;
        Ok(parsed.into_series())
    })?;

    // Read the first and last timestamps (as i64) of the datetime column;
    // they bound a `date_range` with a 1-minute interval.
    let epoch_series = df.column("datetime")?.cast(&DataType::Int64)?;
    let epoch_values = epoch_series.i64()?;
    let start = epoch_values.get(0).unwrap();
    let end = epoch_values.get(epoch_values.len() - 1).unwrap();

    let minute_grid = date_range(
        start,
        end,
        Duration::parse("1m"),
        ClosedWindow::Both,
        "date_range",
        TimeUnit::Nanoseconds,
    );
    let range_df = DataFrame::new(vec![minute_grid.into_series()])?;

    // Fixed 15-minute windows keyed on the generated range column.
    let window_options = DynamicGroupOptions {
        index_column: "date_range".into(),
        every: Duration::parse("15m"),
        period: Duration::parse("15m"),
        offset: Duration::parse("0m"),
        truncate: true,
        include_boundaries: false,
        closed_window: ClosedWindow::Left,
    };

    // Step 1: upsample by left-joining the original frame onto the minute grid.
    let upsampled = range_df.lazy().join(
        df.lazy(),
        [col("date_range")],
        [col("datetime")],
        JoinType::Left,
    );

    // Step 2: interpolate the values that are missing after the upsample.
    let interpolated = upsampled.select([col("*").interpolate()]);

    // Step 3: group into the fixed windows, keeping the first row of each one,
    // to get more regular output.
    let out = interpolated
        .groupby_dynamic([], window_options)
        .agg([col("*").first()])
        .collect()?;

    dbg!(out);
    Ok(())
}
These are the features I used:
["csv-file", "pretty_fmt", "temporal", "dtype-date", "dtype-datetime", "lazy", "interpolate", "dynamic_groupby"]
This outputs:
[src/main.rs:68] out = shape: (9, 9)
┌─────────────────────┬─────────────────────┬────────────┬────────────┬─────┬────────────┬────────────┬─────────────┬────────────┐
│ date_range ┆ date_range_first ┆ site_first ┆ val1_first ┆ ... ┆ val3_first ┆ val4_first ┆ val5_first ┆ val6_first │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[ns] ┆ datetime[ns] ┆ str ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════════════════════╪═════════════════════╪════════════╪════════════╪═════╪════════════╪════════════╪═════════════╪════════════╡
│ 2021-01-01 00:45:00 ┆ 2021-01-01 00:45:00 ┆ XX1 ┆ null ┆ ... ┆ null ┆ 4.6 ┆ null ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 01:00:00 ┆ 2021-01-01 01:00:00 ┆ null ┆ 21.9 ┆ ... ┆ 77.67 ┆ 4.6 ┆ 1027.165 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 01:15:00 ┆ 2021-01-01 01:15:00 ┆ null ┆ 21.9 ┆ ... ┆ 77.67 ┆ 4.6 ┆ 1027.09 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 01:30:00 ┆ 2021-01-01 01:30:00 ┆ null ┆ 21.9 ┆ ... ┆ 77.67 ┆ 4.251515 ┆ 1027.015 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 02:00:00 ┆ 2021-01-01 02:00:00 ┆ XX1 ┆ 21.795 ┆ ... ┆ 78.022333 ┆ 0.0 ┆ 1027.153333 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 02:15:00 ┆ 2021-01-01 02:15:00 ┆ null ┆ 21.57 ┆ ... ┆ 78.777333 ┆ 4.983333 ┆ 1027.053333 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 02:30:00 ┆ 2021-01-01 02:30:00 ┆ null ┆ 21.345 ┆ ... ┆ 79.532333 ┆ 5.366667 ┆ 1026.953333 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2021-01-01 02:45:00 ┆ 2021-01-01 02:45:00 ┆ XX1 ┆ 21.12 ┆ ... ┆ 80.287333 ┆ 5.75 ┆ 1026.853333 ┆ 0.0 │
└─────────────────────┴─────────────────────┴────────────┴────────────┴─────┴────────────┴────────────┴─────────────┴────────────┘
Note that `polars_core` is also needed as a dependency for the `time` module;
this module will be re-exported from `polars` in the next patch release.