I am using PostGIS to calculate length of interactions between two types of employees. I am performing the same set of operations for each area id. Here is my code:
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array character varying[], d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE plpgsql AS $function$
DECLARE i integer := 1;
BEGIN
FOR i IN 1..array_length(areaid_array, 1) LOOP
RETURN QUERY with pings as (
select * from -- ================= SELECTING Parent INFO
(select p.imei,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t,
p."ID" as u_ping_id
from campaign_pings p
where "generatedAt" + interval '5 hours' between start_time__ and end_time__
and p.imei in (/*multiple left joins*/)
) u
left join ( -- ================= SELECTING child INFO
SELECT p.imei,
p."ID" as a_ping_id,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t
FROM campaign_pings p
where "generatedAt"+ interval '5 hours' between start_time__ and end_time__
and p.imei in (/*multiple left joins*/)
) a on ST_DWithin(u."geom", a."geom", 30)
and a.t BETWEEN u.t - INTERVAL '30 seconds'
and u.t + INTERVAL '30 seconds'
where
a.imei is not null
)/*more code to calculate length of continuous interactions in minutes*/;
END LOOP;
END $function$;
So essentially, I am performing the same set of operations for each area id. Is there a way to make use of caching here to reduce execution time? I mean to enforce the use of cache.
P.S - here is the full function:
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array character varying[], d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE plpgsql
AS $function$
DECLARE
i integer := 1;
BEGIN
FOR i IN 1..array_length(areaid_array, 1) LOOP
RETURN QUERY
--SELECT areaid_array[i], d_date;
with pings as ( -- ================= SELECTING JOINED data with join conditions -- ================= -- ================= -- ================= -- ================= -- =================
select
u."imei" as u_imei,
a."imei" as a_imei,
u.t as u_ping_time,
a.t as a_ping_time,
u_ping_id,
a_ping_id
from
(-- ================= SELECTING Parent INFO [imei, geom, generatedAt, pingID] -- ================= -- ================= -- ================= -- ================= -- =================
select
p.imei,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t,
p."ID" as u_ping_id
from
campaign_pings p
where --"createdAt"::date = d_date--d_date
"generatedAt" + interval '5 hours' between start_time__ and end_time__
and
p.imei in (
select
assets."IMEI"
from
campaign_staff staff
left join campaign_staff_info csi on csi."campaignStaffID"=staff."ID" and csi."date" = (select max(date) from campaign_staff_info csi2 where csi2."campaignStaffID" = csi."campaignStaffID" and csi2.date <= d_date)
left join campaign_staff_assets csa on csa."campaignStaffID" = staff."ID" and csa."date" = (select max(date) from campaign_staff_assets cs2 where cs2."campaignStaffID"= csa."campaignStaffID" and cs2.date<=d_date)
left join campaign_area_assets caa on caa."assetID" = csa."assetID"
and caa."campaignScopeAreaID" = csa."campaignScopeID"
left join assets assets on assets."ID" = caa."assetID"
left join staff_type staff_type ON staff_type."ID" = staff."staffTypeID"
left join campaign_asset_status cas on cas."campaignStaffID" = staff."ID" and cas."date" = (select max(date) from campaign_asset_status cas2 where cas2."campaignStaffID"= cas."campaignStaffID" and cas2.date<=d_date)
left join campaign_scope cs on cs."areaID" = caa."campaignScopeAreaID"
where
caa."campaignScopeAreaID" = areaid_array[i]::uuid --'0156276d-29e3-4da8-bc76-39d6ad3bf1f6'
and assets."IMEI" is not null
and staff_type."ID" = '38eb4157-c0e8-4334-ad79-9801610c4ab8' -- UCMO ID
and (cas.status is null or cas.status <> 'DeployedWoPhone')
and ((csi.number is null) or (csi.number <> '0'))
and d_date between cs."campaignStartDate" and cs."catchupEndDate"
)
) u
left join ( -- ================= SELECTING child INFO [imei, geom, generatedAt, pingID] -- ================= -- ================= -- ================= -- ================= -- =================
SELECT
p.imei,
p."ID" as a_ping_id,
st_transform(p."geoJson", 32643) as geom,
"generatedAt" as t
FROM
campaign_pings p
where --"createdAt"::date = d_date --d_date
"generatedAt"+ interval '5 hours' between start_time__ and end_time__
and
p.imei in (
select
assets."IMEI"
from
campaign_staff staff
left join campaign_staff_assets csa on csa."campaignStaffID" = staff."ID" and csa."date" = (select max(date) from campaign_staff_assets cs2 where cs2."campaignStaffID"= csa."campaignStaffID" and cs2.date<=d_date)
left join campaign_area_assets caa on caa."assetID" = csa."assetID"
and caa."campaignScopeAreaID" = csa."campaignScopeID"
left join assets assets on assets."ID" = caa."assetID"
left join staff_type staff_type ON staff_type."ID" = staff."staffTypeID"
left join campaign_asset_status cas on cas."campaignStaffID" = staff."ID" and cas."date" = (select max(date) from campaign_asset_status cas2 where cas2."campaignStaffID"= cas."campaignStaffID" and cas2.date<=d_date)
left join campaign_scope cs on cs."areaID" = caa."campaignScopeAreaID"
where
caa."campaignScopeAreaID" = areaid_array[i]::uuid
and assets."IMEI" is not null
and staff_type."ID" = '93313048-4e75-4e91-9bcd-44897a6df7c7' -- AIC ID
and (cas.status is null or cas.status <> 'DeployedWoPhone')
and d_date between cs."campaignStartDate" and cs."catchupEndDate"
)
) a on ST_DWithin(u."geom", a."geom", 30)
and a.t BETWEEN u.t - INTERVAL '30 seconds'
and u.t + INTERVAL '30 seconds'
where
a.imei is not null
)
,
ranked_interactions AS (
SELECT
u_imei,
a_imei,
a_ping_time +interval '5 hours' as a_ping_time,
u_ping_time+interval '5 hours' as u_ping_time,
(LEAD(u_ping_time) OVER (PARTITION BY u_imei, a_imei ORDER BY u_ping_time)) + interval '5 hours' AS next_parent_time,-- OG
ROW_NUMBER() OVER (PARTITION BY u_imei, a_imei ORDER BY u_ping_time) AS interaction_rank
FROM
pings
)
,
dummy as (
select *,next_parent_time-u_ping_time as time_diff from ranked_interactions
)
,
periods as (
select *, case when time_diff < interval '3 minute' then 'normal'
when time_diff >= interval '3 minute' then 'abnormal'
end as periods
from dummy )
,
ranked_data AS (
SELECT
*,
LEAD(a_ping_time) OVER (PARTITION BY u_imei, a_imei ORDER BY a_ping_time) AS start_time,
max(a_ping_time) over (PARTITION BY u_imei, a_imei) as max_time
FROM periods
),
f_data AS (
SELECT
*
FROM ranked_data
WHERE periods = 'abnormal'
union
select * FROM ranked_data
where interaction_rank=1
)
,
format as (
select *,lead(a_ping_time) over (PARTITION BY u_imei, a_imei) as duration_calc
from f_data
)
,
results as (
select *,
duration_calc-start_time as duration from format
where interaction_rank <> 1 and duration_calc is not null
union
select *,
-- duration_calc - a_ping_time as duration from format
duration_calc - start_time as duration from format---changedcode
where interaction_rank = 1
union
select *, max_time - start_time as duration from format
where duration_calc is null
order by u_imei,a_imei,interaction_rank
)
,
results_filtered as (
select * from results where duration>= interval '3 minute'
)
,
--parent_imei, child_imei, parent_ping_time, child_ping_time, maxima, minima, "interactionID", "forUcmo"
adding as (
select
--null as created_at,null as updated_at,null as id,
--results_filtered.*,
*,
case when interaction_rank = 1 and periods='normal'
then start_time - interval '1 minute'
else start_time
end as start_time_final from results_filtered)
select areaid_array[i]::uuid as area_id_,
NOW()::TIMESTAMP WITHOUT TIME zone as createdat,NOW()::TIMESTAMP WITHOUT TIME ZONE as updatedat,
CONCAT(adding.start_time_final, '-', adding.a_imei, '-', adding.u_imei,'-',NOW())::varchar as id,
--adding.created_at,
--adding.updated_at,
--adding.id,
adding.u_imei::varchar as parent_imei_ ,
adding.a_imei::varchar as child_imei_,
adding.u_ping_time as parent_ping_time_,
adding.a_ping_time as child_ping_time_,
case when adding.interaction_rank=1 and adding.periods='normal'
then adding.start_time_final+duration+interval '1 minute'
else adding.start_time_final+duration
end as maxima_,
adding.start_time_final as minima_,
--adding.u_imei+adding.a_imei+duration as "interactionID_",
CONCAT(adding.u_imei, '-', adding.a_imei, '-', duration)::varchar as "interactionID_",
'UA'::varchar as interaction_type
--adding.duration,adding.start_time_final
from adding
;
END LOOP;
END
$function$
;
The example isn't reproducible so I'm just compiling a bunch of comments into a sort-of-an-answer to the underlying question of how to speed this up, plus some random tips, in no particular order:
int
iterator, then subscript the array. You can iterate over the array directlyCREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(areaid_array uuid[]
--...
DECLARE current_areaid uuid;
BEGIN FOREACH current_areaid IN ARRAY areaid_array LOOP RETURN QUERY
--...
WHERE caa."campaignScopeAreaID" = current_areaid
null
, use concat_ws()
. It lets you specify the separator once, and then just list the fields to be separated by it.CONCAT_WS('-',adding.start_time_final, adding.a_imei, adding.u_imei,NOW())::varchar as id,
campaign_staff
drives the imei
list subquery, and then you filter that down afterwards. You might want to make sure it's driven by the smallest table and that you filter things out as early as possible. To an extent, Postgres will try to rearrange your joins and conditions attempting to do that automatically, but with this much stuff that might not be effective.campaign_staff_info
, campaign_staff_assets
and campaign_asset_status
, then filter that down using a subquery from that same table. You could just as well join directly to the subquery and do your filtering in it. You could also try to pre-fetch those max(date)
s along with the csi."campaignStaffID"
and later join to that pre-fetched, static temp table - if holding on to the additional column in a not fully filtered set isn't that much of an expense, you might get some performance by doing it once ahead of the loop instead of repeating twice in each loop.areaid
, then let another regular SQL function wrap it to allow for accepting an array, just to unnest()
it and call the primary function for each element. PostgreSQL will likely do a better job distributing the work, not necessarily stuffing it all into a queue, where each area has to wait for the previous ones to finish. It might even distribute some parallel workers and evaluate all queries for each areaid
, all at the same time.CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(p_areaid uuid, d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE SQL AS $function$
with pings as (
--...
CREATE OR REPLACE FUNCTION public.interactions_looped_ucmo_aic(p_areaid_array uuid[], d_date date, start_time__ timestamp without time zone, end_time__ timestamp without time zone)
RETURNS TABLE(area_id uuid, "createdAt_" timestamp without time zone, "updatedAt_" timestamp without time zone, "ID_" character varying, parent_imei character varying, child_imei character varying, parent_ping_time timestamp without time zone, child_ping_time timestamp without time zone, maxima timestamp without time zone, minima timestamp without time zone, "interactionID" character varying, interaction_type character varying)
LANGUAGE SQL AS $function$
SELECT interactions_looped_ucmo_aic(p_areaid, d_date, start_time__, end_time__)
FROM UNNEST(p_areaid_array)p_areaid
$function$;