I'm trying to write a query against parquet using Polars SQL Context. It's working great if I pre-filter my arrow table by date. I cannot figure out how to use date in the SQL query.
works:
filters = define_filters(event.get("filters", None))
table = pq.read_table(
f"s3://my_s3_path{partition_path}",
partitioning="hive",
filters=filters,
)
df = pl.from_arrow(table)
ctx = pl.SQLContext(stuff=df)
sql = "SELECT things FROM stuff"
new_df = ctx.execute(sql,eager=True)
doesn't work (filters==None
in this case):
filters = define_filters(event.get("filters", None))
table = pq.read_table(
f"s3://my_s3_path{partition_path}",
partitioning="hive",
filters=filters,
)
df = pl.from_arrow(table)
ctx = pl.SQLContext(stuff=df)
sql = """
SELECT things
FROM stuff
where START_DATE_KEY >= '2023-06-01' and START_DATE_KEY < '2023-06-17'
"""
new_df = ctx.execute(sql,eager=True)
I get the following error:
Traceback (most recent call last):
File "/Users/xaras/projects/arrow-lambda/loose.py", line 320, in <module>
test_runner(target=args.t, limit=args.limit, is_debug=args.debug)
File "/Users/xaras/projects/arrow-lambda/loose.py", line 294, in test_runner
rows, metadata = test_handler(target, limit, display=True, is_debug=is_debug)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/xaras/projects/arrow-lambda/loose.py", line 110, in test_handler
response = json.loads(handler(event, None))
^^^^^^^^^^^^^^^^^^^^
File "/Users/xaras/projects/arrow-lambda/serverless/app.py", line 42, in handler
new_df = ctx.execute(
^^^^^^^^^^^^
File "/Users/xaras/.pyenv/versions/pyarrow/lib/python3.11/site-packages/polars/sql/context.py", line 275, in execute
return res.collect() if (eager or self._eager_execution) else res
^^^^^^^^^^^^^
File "/Users/xaras/.pyenv/versions/pyarrow/lib/python3.11/site-packages/polars/utils/deprecation.py", line 95, in wrapper
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/xaras/.pyenv/versions/pyarrow/lib/python3.11/site-packages/polars/lazyframe/frame.py", line 1713, in collect
return wrap_df(ldf.collect())
^^^^^^^^^^^^^
exceptions.ComputeError: cannot compare 'date/datetime/time' to a string value (create native python { 'date', 'datetime', 'time' } or compare to a temporal column)
UPDATE for addl info:
I started using cast('2023-06-01' as date)
. This runs, but does not return any records.
Here is a replicatable example:
import polars as pl
df = pl.DataFrame(
{
"a": ["2023-06-01", "2023-06-01", "2023-06-02"],
"b": [None, None, None],
"c": [4, 5, 6],
"d": [None, None, None],
}
)
df = df.with_columns(pl.col("a").str.strptime(pl.Date, "%Y-%m-%d", strict=False))
print(df)
ctx = pl.SQLContext(stuff=df)
new_df = ctx.execute(
"select * from stuff where a = cast('2023-06-01' as date)",
eager=True,
)
print(new_df)
## Returns...
shape: (3, 4)
┌────────────┬──────┬─────┬──────┐
│ a ┆ b ┆ c ┆ d │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ f32 ┆ i64 ┆ f32 │
╞════════════╪══════╪═════╪══════╡
│ 2023-06-01 ┆ null ┆ 4 ┆ null │
│ 2023-06-01 ┆ null ┆ 5 ┆ null │
│ 2023-06-02 ┆ null ┆ 6 ┆ null │
└────────────┴──────┴─────┴──────┘
shape: (0, 4)
┌──────┬─────┬─────┬─────┐
│ a ┆ b ┆ c ┆ d │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ f32 ┆ i64 ┆ f32 │
╞══════╪═════╪═════╪═════╡
└──────┴─────┴─────┴─────┘
Update: Implicit string → temporal conversion in SQL comparisons was added in Polars 0.20.26
The original query now runs as expected.
import polars as pl
df = pl.DataFrame({
"a": ["2023-06-01", "2023-06-01", "2023-06-02"],
"b": [None, None, None],
"c": [4, 5, 6],
"d": [None, None, None],
})
df = df.with_columns(pl.col("a").str.to_date())
with pl.SQLContext(stuff=df) as ctx:
ctx.execute(
"select * from stuff where a = '2023-06-01'",
eager=True
)
shape: (2, 4)
┌────────────┬──────┬─────┬──────┐
│ a ┆ b ┆ c ┆ d │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ f32 ┆ i64 ┆ f32 │
╞════════════╪══════╪═════╪══════╡
│ 2023-06-01 ┆ null ┆ 4 ┆ null │
│ 2023-06-01 ┆ null ┆ 5 ┆ null │
└────────────┴──────┴─────┴──────┘