Search code examples
rust, rust-polars

How to avoid deep copy when using groupby in polars rust?


I have a dataset on which I need to perform groupby operations over different columns. Here is a minimal working example using polars version "0.21.1":

use polars::prelude::*;
use polars_lazy::prelude::*;
use polars::df;

fn main(){
  let df = df![
    "x1" => ["a", "b", "c", "a"],
    "x2" => ["A", "A", "B", "B"],
    "y" => [1, 2, 3, 4],
    ].unwrap();

  let lf: LazyFrame = df.lazy();

  let out1 = groupby_x1(&lf);
  println!("{:?}", out1.collect());
  let out2 = groupby_x2(&lf);
  println!("{:?}", out2.collect());

}

/// Returns a lazy query that groups `lf` by column `x1` and sums column `y`
/// into a new column named `y_sum`.
///
/// `groupby` consumes the frame it is called on, so the borrowed `lf` is
/// cloned before the query is built.
fn groupby_x1(lf: &LazyFrame) -> LazyFrame {
  let keys = [col("x1")];
  let aggregations = [col("y").sum().alias("y_sum")];
  lf.clone().groupby(keys).agg(aggregations)
}

/// Returns a lazy query that groups `lf` by column `x2` and sums column `y`
/// into a new column named `y_sum`.
///
/// `groupby` consumes the frame it is called on, so the borrowed `lf` is
/// cloned before the query is built.
fn groupby_x2(lf: &LazyFrame) -> LazyFrame {
  let keys = [col("x2")];
  let aggregations = [col("y").sum().alias("y_sum")];
  lf.clone().groupby(keys).agg(aggregations)
}

But in this code I am making deep copies of the whole LazyFrame lf (using lf.clone()). How can I avoid that? If I replace lf.clone() with lf in the functions groupby_x1 and groupby_x2, I get the following error:

error[E0507]: cannot move out of `*lf` which is behind a shared reference
  --> src/main.rs:22:24
   |
22 |   let lf1: LazyFrame = lf.groupby([col("x1")]).agg([
   |                        ^^^^^^^^^^^^^^^^^^^^^^^ move occurs because `*lf` has type `polars_lazy::frame::LazyFrame`, which does not implement the `Copy` trait

error[E0507]: cannot move out of `*lf` which is behind a shared reference
  --> src/main.rs:29:24
   |
29 |   let lf1: LazyFrame = lf.groupby([col("x2")]).agg([
   |                        ^^^^^^^^^^^^^^^^^^^^^^^ move occurs because `*lf` has type `polars_lazy::frame::LazyFrame`, which does not implement the `Copy` trait

For more information about this error, try `rustc --explain E0507`.
error: could not compile `polars_try` due to 2 previous errors

Solution

  • Polars Series are a newtype around Arc<Vec<ArrowRef>>. When you clone a DataFrame only the reference count of the Arc is incremented.

    In other words, polars never does deep clones. Clones of a DataFrame are super cheap.