I have a query here that uses four subqueries inside a single CTE, and each subquery is scanning every row of another CTE for each row in itself. I would think that this is very inefficient.
Are there any SQL optimizations that I can implement now that the proof of concept is finished? I don't have write access to the database, so optimizations would be required within the select clause.
WITH datetable AS (
SELECT generate_series(
DATE_TRUNC('week', (SELECT MIN(created_at) FROM org_accounts.deleted_users)),
DATE_TRUNC('week', now()),
'1 week'::INTERVAL
)::DATE AS week_start
), all_users AS (
SELECT
id,
registered_at,
NULL AS deleted_at
FROM org_accounts.users
WHERE status = 'active'
AND org_accounts.__user_is_qa(id) <> 'Y'
AND email NOT LIKE '%@org%'
UNION ALL
SELECT
id,
created_at AS registered_at,
deleted_at
FROM org_accounts.deleted_users
WHERE deleter_id = id
AND email NOT LIKE '%@org%'
), weekly_activity AS (
SELECT
DATE_TRUNC('week', date)::DATE AS week_start,
COUNT(DISTINCT user_id) AS weekly_active_users
FROM (
SELECT user_id, date
FROM org_storage_extra.stats_user_daily_counters
WHERE type in ('created_file', 'created_folder', 'created_secure_fetch')
UNION ALL
SELECT user_id, date
FROM ipfs_pinning_facility.stats_user_daily_counters
WHERE type <> 'shares_viewed_by_others'
) activity_ids_dates
WHERE EXISTS(SELECT 1 from all_users WHERE id = user_id)
GROUP BY week_start
), preprocessed AS (
SELECT
week_start,
(
SELECT COUNT(DISTINCT id)
FROM all_users
WHERE registered_at < week_start
AND (deleted_at IS NULL OR deleted_at > week_start)
) AS actual_users,
(
SELECT COUNT(DISTINCT id)
FROM all_users
WHERE deleted_at < week_start + '1 week'::INTERVAL
) AS cumulative_churned_users,
(
SELECT COUNT(DISTINCT id)
FROM all_users
WHERE registered_at >= week_start
AND registered_at < week_start + '1 week'::INTERVAL
) AS weekly_new_users,
(
SELECT COUNT(DISTINCT id)
FROM all_users
WHERE deleted_at >= week_start
AND deleted_at < week_start + '1 week'::INTERVAL
) AS weekly_churned_users,
COALESCE(weekly_active_users, 0) AS weekly_active_users
FROM datetable dt
LEFT JOIN weekly_activity USING (week_start)
ORDER BY week_start DESC
)
SELECT
week_start AS for_week_of,
actual_users + cumulative_churned_users AS cumulative_users,
cumulative_churned_users,
cumulative_churned_users::FLOAT / NULLIF((actual_users + cumulative_churned_users)::FLOAT, 0) AS cumulated_churn_rate,
actual_users,
weekly_new_users,
weekly_churned_users,
weekly_active_users,
weekly_churned_users::FLOAT / NULLIF(actual_users::FLOAT, 0) AS weekly_churn_rate
FROM preprocessed;
Results of query analysis:
QUERY PLAN
Subquery Scan on preprocessed (cost=40875.45..7501783.95 rows=1000 width=68) (actual time=1553.471..13613.116 rows=231 loops=1)
Output: preprocessed.week_start, (preprocessed.actual_users + preprocessed.cumulative_churned_users), preprocessed.cumulative_churned_users, ((preprocessed.cumulative_churned_users)::double precision / NULLIF(((preprocessed.actual_users + preprocessed.cumulative_churned_users))::double precision, '0'::double precision)), preprocessed.actual_users, preprocessed.weekly_new_users, preprocessed.weekly_churned_users, preprocessed.weekly_active_users, ((preprocessed.weekly_churned_users)::double precision / NULLIF((preprocessed.actual_users)::double precision, '0'::double precision))
Buffers: shared hit=287734 read=1964, temp read=274840 written=873
CTE all_users
-> Append (cost=0.00..30953.99 rows=70293 width=32) (actual time=0.099..1313.372 rows=71228 loops=1)
Buffers: shared hit=285995 read=1964
-> Seq Scan on org_accounts.users (cost=0.00..27912.65 rows=70009 width=32) (actual time=0.099..1289.469 rows=70007 loops=1)
Output: users.id, users.registered_at, NULL::timestamp with time zone
Filter: ((users.email !~~ '%@mailinator%'::text) AND (users.email !~~ '%@org%'::text) AND (users.email !~~ '%testaccnt%'::text) AND (users.status = 'active'::text) AND ((org_accounts.__user_is_qa(users.id))::text <> 'Y'::text))
Rows Removed by Filter: 9933
Buffers: shared hit=285269 read=1964
-> Seq Scan on org_accounts.deleted_users (cost=0.00..1986.94 rows=284 width=32) (actual time=0.014..14.267 rows=1221 loops=1)
Output: deleted_users.id, deleted_users.created_at, deleted_users.deleted_at
Filter: ((deleted_users.email !~~ '%@mailinator%'::text) AND (deleted_users.email !~~ '%@org%'::text) AND (deleted_users.email !~~ '%testaccnt%'::text) AND (deleted_users.deleter_id = deleted_users.id))
Rows Removed by Filter: 61826
Buffers: shared hit=726
-> Merge Left Join (cost=9921.47..7470794.97 rows=1000 width=44) (actual time=1553.467..13612.496 rows=231 loops=1)
Output: (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date), (SubPlan 2), (SubPlan 3), (SubPlan 4), (SubPlan 5), COALESCE(weekly_activity.weekly_active_users, '0'::bigint)
Inner Unique: true
Merge Cond: ((((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date) = weekly_activity.week_start)
Buffers: shared hit=287734 read=1964, temp read=274840 written=873
-> Sort (cost=1601.45..1603.95 rows=1000 width=4) (actual time=10.108..10.250 rows=231 loops=1)
Output: (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date)
Sort Key: (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date) DESC
Sort Method: quicksort Memory: 35kB
Buffers: shared hit=726
-> Result (cost=1514.10..1541.62 rows=1000 width=4) (actual time=9.986..10.069 rows=231 loops=1)
Output: ((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date
Buffers: shared hit=726
InitPlan 6 (returns $5)
-> Aggregate (cost=1514.09..1514.10 rows=1 width=8) (actual time=9.974..9.975 rows=1 loops=1)
Output: min(deleted_users_1.created_at)
Buffers: shared hit=726
-> Seq Scan on org_accounts.deleted_users deleted_users_1 (cost=0.00..1356.47 rows=63047 width=8) (actual time=0.006..4.332 rows=63047 loops=1)
Output: deleted_users_1.id, deleted_users_1.email, deleted_users_1.created_at, deleted_users_1.deleter_id, deleted_users_1.deleted_at, deleted_users_1.registration_app
Buffers: shared hit=726
-> ProjectSet (cost=0.00..5.03 rows=1000 width=8) (actual time=9.984..10.030 rows=231 loops=1)
Output: generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)
Buffers: shared hit=726
-> Result (cost=0.00..0.01 rows=1 width=0) (actual time=0.000..0.001 rows=1 loops=1)
-> Sort (cost=8320.02..8320.52 rows=200 width=12) (actual time=1475.315..1475.418 rows=159 loops=1)
Output: weekly_activity.weekly_active_users, weekly_activity.week_start
Sort Key: weekly_activity.week_start DESC
Sort Method: quicksort Memory: 32kB
Buffers: shared hit=287008 read=1964, temp read=412 written=872
-> Subquery Scan on weekly_activity (cost=8050.90..8312.37 rows=200 width=12) (actual time=1466.686..1475.279 rows=159 loops=1)
Output: weekly_activity.weekly_active_users, weekly_activity.week_start
Buffers: shared hit=287008 read=1964, temp read=412 written=872
-> GroupAggregate (cost=8050.90..8310.37 rows=200 width=12) (actual time=1466.685..1475.254 rows=159 loops=1)
Output: ((date_trunc('week'::text, ("*SELECT* 1".date)::timestamp with time zone))::date), count(DISTINCT "*SELECT* 1".user_id)
Group Key: ((date_trunc('week'::text, ("*SELECT* 1".date)::timestamp with time zone))::date)
Buffers: shared hit=287008 read=1964, temp read=412 written=872
-> Sort (cost=8050.90..8136.22 rows=34130 width=20) (actual time=1466.668..1468.872 rows=23005 loops=1)
Output: ((date_trunc('week'::text, ("*SELECT* 1".date)::timestamp with time zone))::date), "*SELECT* 1".user_id
Sort Key: ((date_trunc('week'::text, ("*SELECT* 1".date)::timestamp with time zone))::date)
Sort Method: quicksort Memory: 2566kB
Buffers: shared hit=287008 read=1964, temp read=412 written=872
-> Hash Join (cost=1586.09..5481.12 rows=34130 width=20) (actual time=1411.350..1462.022 rows=23005 loops=1)
Output: (date_trunc('week'::text, ("*SELECT* 1".date)::timestamp with time zone))::date, "*SELECT* 1".user_id
Inner Unique: true
Hash Cond: ("*SELECT* 1".user_id = all_users.id)
Buffers: shared hit=287008 read=1964, temp read=412 written=872
-> Append (cost=0.00..3080.17 rows=68261 width=20) (actual time=0.010..25.441 rows=68179 loops=1)
Buffers: shared hit=1013
-> Subquery Scan on "*SELECT* 1" (cost=0.00..1018.43 rows=21568 width=20) (actual time=0.008..7.895 rows=21532 loops=1)
Output: "*SELECT* 1".date, "*SELECT* 1".user_id
Buffers: shared hit=372
-> Seq Scan on org_storage_extra.stats_user_daily_counters (cost=0.00..802.75 rows=21568 width=20) (actual time=0.008..5.910 rows=21532 loops=1)
Output: stats_user_daily_counters.user_id, stats_user_daily_counters.date
Filter: (stats_user_daily_counters.type = ANY ('{created_file,created_folder,created_secure_fetch}'::text[]))
Rows Removed by Filter: 9795
Buffers: shared hit=372
-> Subquery Scan on "*SELECT* 2" (cost=0.00..1720.44 rows=46693 width=20) (actual time=0.009..12.460 rows=46647 loops=1)
Output: "*SELECT* 2".date, "*SELECT* 2".user_id
Buffers: shared hit=641
-> Seq Scan on ipfs_pinning_facility.stats_user_daily_counters stats_user_daily_counters_1 (cost=0.00..1253.51 rows=46693 width=20) (actual time=0.009..8.209 rows=46647 loops=1)
Output: stats_user_daily_counters_1.user_id, stats_user_daily_counters_1.date
Filter: (stats_user_daily_counters_1.type <> 'shares_viewed_by_others'::text)
Rows Removed by Filter: 2354
Buffers: shared hit=641
-> Hash (cost=1583.59..1583.59 rows=200 width=16) (actual time=1411.250..1411.251 rows=71228 loops=1)
Output: all_users.id
Buckets: 131072 (originally 1024) Batches: 2 (originally 1) Memory Usage: 3073kB
Buffers: shared hit=285995 read=1964, temp read=100 written=717
-> HashAggregate (cost=1581.59..1583.59 rows=200 width=16) (actual time=1383.986..1398.270 rows=71228 loops=1)
Output: all_users.id
Group Key: all_users.id
Batches: 5 Memory Usage: 4161kB Disk Usage: 1544kB
Buffers: shared hit=285995 read=1964, temp read=100 written=560
-> CTE Scan on all_users (cost=0.00..1405.86 rows=70293 width=16) (actual time=0.102..1351.241 rows=71228 loops=1)
Output: all_users.id
Buffers: shared hit=285995 read=1964, temp written=296
SubPlan 2
-> Aggregate (cost=1777.05..1777.06 rows=1 width=8) (actual time=20.197..20.197 rows=1 loops=231)
Output: count(DISTINCT all_users_1.id)
Buffers: temp read=68607 written=1
-> CTE Scan on all_users all_users_1 (cost=0.00..1757.33 rows=7888 width=16) (actual time=0.883..10.874 rows=27239 loops=231)
Output: all_users_1.id, all_users_1.registered_at, all_users_1.deleted_at
Filter: ((all_users_1.registered_at < (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date)) AND ((all_users_1.deleted_at IS NULL) OR (all_users_1.deleted_at > (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date))))
Rows Removed by Filter: 43989
Buffers: temp read=68607 written=1
SubPlan 3
-> Aggregate (cost=1815.90..1815.91 rows=1 width=8) (actual time=11.215..11.215 rows=1 loops=231)
Output: count(DISTINCT all_users_2.id)
Buffers: temp read=68607
-> CTE Scan on all_users all_users_2 (cost=0.00..1757.33 rows=23431 width=16) (actual time=11.009..11.150 rows=231 loops=231)
Output: all_users_2.id, all_users_2.registered_at, all_users_2.deleted_at
Filter: (all_users_2.deleted_at < ((((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date) + '7 days'::interval))
Rows Removed by Filter: 70997
Buffers: temp read=68607
SubPlan 4
-> Aggregate (cost=1933.94..1933.95 rows=1 width=8) (actual time=14.515..14.515 rows=1 loops=231)
Output: count(DISTINCT all_users_3.id)
Buffers: temp read=68607
-> CTE Scan on all_users all_users_3 (cost=0.00..1933.06 rows=351 width=16) (actual time=2.264..14.424 rows=308 loops=231)
Output: all_users_3.id, all_users_3.registered_at, all_users_3.deleted_at
Filter: ((all_users_3.registered_at >= (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date)) AND (all_users_3.registered_at < ((((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date) + '7 days'::interval)))
Rows Removed by Filter: 70920
Buffers: temp read=68607
SubPlan 5
-> Aggregate (cost=1933.94..1933.95 rows=1 width=8) (actual time=6.556..6.556 rows=1 loops=231)
Output: count(DISTINCT all_users_4.id)
Buffers: temp read=68607
-> CTE Scan on all_users all_users_4 (cost=0.00..1933.06 rows=351 width=16) (actual time=6.441..6.547 rows=5 loops=231)
Output: all_users_4.id, all_users_4.registered_at, all_users_4.deleted_at
Filter: ((all_users_4.deleted_at >= (((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date)) AND (all_users_4.deleted_at < ((((generate_series(date_trunc('week'::text, $5), date_trunc('week'::text, now()), '7 days'::interval)))::date) + '7 days'::interval)))
Rows Removed by Filter: 71223
Buffers: temp read=68607
Planning Time: 0.612 ms
Execution Time: 13615.054 ms
An obvious optimization is to eliminate redundant table scans. There isn't any need in preprocessed
to query from all_users
more than once. The following query uses COUNT
with FILTER
to gather the same statistics:
WITH datetable AS (SELECT GENERATE_SERIES(
DATE_TRUNC('week', (SELECT MIN(created_at) FROM org_accounts.deleted_users)),
DATE_TRUNC('week', NOW()),
'1 week'::INTERVAL
)::DATE AS week_start),
all_users AS (SELECT id,
registered_at,
NULL AS deleted_at
FROM org_accounts.users
WHERE status = 'active'
AND org_accounts.__user_is_qa(id) <> 'Y'
AND email NOT LIKE '%@org%'
UNION ALL
SELECT id,
created_at AS registered_at,
deleted_at
FROM org_accounts.deleted_users
WHERE deleter_id = id
AND email NOT LIKE '%@org%'),
weekly_activity AS (SELECT DATE_TRUNC('week', date)::DATE AS week_start,
COUNT(DISTINCT user_id) AS weekly_active_users
FROM (SELECT user_id, date
FROM org_storage_extra.stats_user_daily_counters
WHERE type IN ('created_file', 'created_folder', 'created_secure_fetch')
UNION ALL
SELECT user_id, date
FROM ipfs_pinning_facility.stats_user_daily_counters
WHERE type <> 'shares_viewed_by_others') activity_ids_dates
WHERE EXISTS(SELECT 1 FROM all_users WHERE id = user_id)
GROUP BY week_start),
preprocessed AS (SELECT week_start,
us.actual_users,
us.cumulative_churned_users,
us.weekly_new_users,
us.weekly_churned_users,
COALESCE(weekly_active_users, 0) AS weekly_active_users
FROM datetable dt
CROSS JOIN LATERAL (SELECT
COUNT(DISTINCT u.id) FILTER (WHERE u.registered_at < dt.week_start AND
(u.deleted_at IS NULL OR u.deleted_at > dt.week_start)) AS actual_users,
COUNT(DISTINCT u.id)
FILTER (WHERE u.deleted_at < dt.week_start + '1 week'::INTERVAL) AS cumulative_churned_users,
COUNT(DISTINCT u.id)
FILTER (WHERE u.registered_at >= dt.week_start AND u.registered_at <
dt.week_start +
'1 week'::INTERVAL) AS weekly_new_users,
COUNT(DISTINCT u.id)
FILTER (WHERE u.deleted_at >= dt.week_start AND u.deleted_at <
dt.week_start +
'1 week'::INTERVAL) AS weekly_churned_users
FROM all_users u
WHERE u.registered_at < dt.week_start + '1 week'::INTERVAL
OR (u.deleted_at >= dt.week_start AND
u.deleted_at < dt.week_start + '1 week'::INTERVAL)) us
LEFT JOIN weekly_activity
USING (week_start)
ORDER BY week_start DESC)
SELECT week_start AS for_week_of,
actual_users + cumulative_churned_users AS cumulative_users,
cumulative_churned_users,
cumulative_churned_users::FLOAT /
NULLIF((actual_users + cumulative_churned_users)::FLOAT, 0) AS cumulated_churn_rate,
actual_users,
weekly_new_users,
weekly_churned_users,
weekly_active_users,
weekly_churned_users::FLOAT / NULLIF(actual_users::FLOAT, 0) AS weekly_churn_rate
FROM preprocessed;
There are probably other optimizations possible, but this one was immediately evident.