I have the below code in HQL. It's drawing data from a table with >2.5bn rows and around 334 columns. It takes a while to run so I'm looking for any potential performance boosts where possible. This is main block of the query, there are subsequent CTEs which perform some further processing but the below is the most expensive query.
I appreciate window functions can be heavy in this context, however they are required as highly specific timestamp calculations are needed.
Any help is greatly appreciated!
WITH t1 AS (
SELECT
*,
CASE
WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) > 1
THEN NULL
WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 1
THEN UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss') - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
WHEN (LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) = TRUE
AND (hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`)) = 1
AND (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 0
THEN (LEAD(UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) -
UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`))
ELSE LEAD(unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`)
- unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
END AS time_to_next_trans
FROM(
SELECT
*,
(overlap_time_to_next_trans - sec_between_current_trans_and_next_hr) as hour_overlap_add
FROM(
SELECT
c_id,
s_id,
id,
rat,
dt,
`date`,
`time`,
hour,
(LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change,
LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour,
UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss')
- UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr,
LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) -
UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
FROM database.tablename
WHERE dt = "${rundate}” AND ID <> 0
)j
)o
)
Have most of your derivations in step 1. And then reuse these without deriving again.
with j as
(
SELECT
c_id
,s_id
,id
,rat
,dt
,`date`
,`time`
,hour
,(LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lead
,(LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lag
,LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour
,hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`) as new_hour_diff
,UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss') - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr,
,LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) - UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
FROM database.tablename
WHERE dt = "${rundate}" AND ID <> 0
),
o as
(
SELECT
*,
overlap_time_to_next_trans - sec_between_current_trans_and_next_hr as hour_overlap_add
FROM j
),
t1 as
(
SELECT
*
,CASE
WHEN difference_hour > 1
THEN NULL
WHEN difference_hour = 1
THEN sec_between_current_trans_and_next_hr
WHEN s_id_change_lag = TRUE AND new_hour_diff=1 AND difference_hour = 0
THEN overlap_time_to_next_trans + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`)
ELSE LEAD(unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`) - unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
END AS time_to_next_trans
FROM o
)
select * from t1;
Hope this helps.