I have this table
id(vehicle_id) = SYMBOL
model_id(vehicle_type_id) = SYMBOL
lock_status(true/false) = BOOLEAN
speed(vehicle_speed) = LONG
ts which is time series = TIMESTAMP
Scenario:
At the moment I have a solution based in JOINS, but I was wondering if something more efficient could be done based on window functions
-- Step 1: Identify ride boundaries
WITH ride_boundaries AS (
SELECT
main.vehicle_id,
main.ts,
main.lock_status,
MAX(sub.ts) AS prev_ts,
FIRST(sub.lock_status) AS prev_lock_status
FROM oem_vehicle_callback AS main
LEFT JOIN oem_vehicle_callback AS sub
ON sub.vehicle_id = main.vehicle_id
AND sub.ts < main.ts
AND sub.lock_status != main.lock_status
GROUP BY main.vehicle_id, main.ts, main.lock_status
),
-- Step 2: Filter for ride start events
ride_start_events AS (
SELECT
rb.vehicle_id,
rb.ts AS ride_start,
rb.lock_status,
rb.prev_lock_status
FROM ride_boundaries AS rb
WHERE rb.lock_status = false
AND (rb.prev_lock_status IS NULL OR rb.prev_lock_status = true)
),
-- Step 3: Find ride end events
ride_durations AS (
SELECT
start.vehicle_id,
start.ride_start,
MIN(sub.ts) AS ride_end
FROM ride_start_events AS start
LEFT JOIN oem_vehicle_callback AS sub
ON sub.vehicle_id = start.vehicle_id
AND sub.ts > start.ride_start
AND sub.lock_status != start.lock_status
GROUP BY start.vehicle_id, start.ride_start
),
-- Step 4: Aggregate total ride duration by month
monthly_ride_durations AS (
SELECT
date_trunc('M', ride_start) AS month, -- Corrected here
SUM(CAST((ride_end - ride_start) AS LONG) / 1000000) AS total_ride_duration_seconds
FROM ride_durations
WHERE ride_end IS NOT NULL
GROUP BY date_trunc('M', ride_start) -- Corrected this line
)
-- Step 5: Output the result
SELECT
month,
total_ride_duration_seconds
FROM monthly_ride_durations
ORDER BY month;
Using window functions we can make it work like this:
with prevEvents AS (
select *, first_value(case WHEN lock_status=false THEN 0
WHEN lock_status=true THEN 1
END)
over(
partition by vehicle_id order by ts
ROWS 1 PRECEDING EXCLUDE CURRENT ROW
) as prev_status
from oem_vehicle_callback where ts in today()
), ride_sessions AS (
select *, sum(case
when lock_status = true and prev_status = 0
OR lock_status = false and prev_status = 1
THEN 1
ELSE 0
END
) OVER (partition by vehicle_id order by ts) as ride
from prevEvents
), global_sessions AS (
select *, concat(vehicle_id, '#', ride) as session from ride_sessions
), totals AS (
select first(ts) ts, session,
FIRST(lock_status) lock_status, first(vehicle_id) as vehicle_id
from global_sessions
), prev_ts AS (
SELECT *, first_value(ts::long) OVER(partition by vehicle_id order by ts
ROWS 1 PRECEDING EXCLUDE CURRENT ROW)
as prev_ts from totals)
SELECT *, (ts::long - prev_ts) / 1000000 as seconds from prev_ts
where lock_status=false;
What I do here is:
I believe this query might be simplified a bit, but the idea still remains. By the way, QuestDB will be releasing soon the LAG
and LEAD
functions, which should be easier to use than the first_value
window function.