I've made a query based on the one shared here: https://github.com/sagishporer/big-query-queries-for-firebase/wiki/Query:-Daily-retention to calculate user retention in BigQuery with the data streamed from Firebase.
So far it has been working, but as the data sets get bigger, it is no longer able to run it due to the following error:
Resources exceeded during query execution: The query could not be executed in the allotted memory. Peak usage: 129% of limit. Top memory consumer(s): sort operations used for analytic OVER() clauses: 100%
The query is the following:
SELECT
install_date,
SUM(CASE
WHEN days_since_install = 0 THEN users
ELSE 0 END) AS day_0,
SUM(CASE
WHEN days_since_install = 1 THEN users
ELSE 0 END) AS day_1,
SUM(CASE
WHEN days_since_install = 2 THEN users
ELSE 0 END) AS day_2,
SUM(CASE
WHEN days_since_install = 3 THEN users
ELSE 0 END) AS day_3,
SUM(CASE
WHEN days_since_install = 4 THEN users
ELSE 0 END) AS day_4,
SUM(CASE
WHEN days_since_install = 5 THEN users
ELSE 0 END) AS day_5,
SUM(CASE
WHEN days_since_install = 6 THEN users
ELSE 0 END) AS day_6,
SUM(CASE
WHEN days_since_install = 7 THEN users
ELSE 0 END) AS day_7,
SUM(CASE
WHEN days_since_install = 8 THEN users
ELSE 0 END) AS day_8,
SUM(CASE
WHEN days_since_install = 9 THEN users
ELSE 0 END) AS day_9,
SUM(CASE
WHEN days_since_install = 10 THEN users
ELSE 0 END) AS day_10,
SUM(CASE
WHEN days_since_install = 11 THEN users
ELSE 0 END) AS day_11,
SUM(CASE
WHEN days_since_install = 12 THEN users
ELSE 0 END) AS day_12,
SUM(CASE
WHEN days_since_install = 13 THEN users
ELSE 0 END) AS day_13,
SUM(CASE
WHEN days_since_install = 14 THEN users
ELSE 0 END) AS day_14,
SUM(CASE
WHEN days_since_install = 15 THEN users
ELSE 0 END) AS day_15,
SUM(CASE
WHEN days_since_install = 16 THEN users
ELSE 0 END) AS day_16,
SUM(CASE
WHEN days_since_install = 17 THEN users
ELSE 0 END) AS day_17,
SUM(CASE
WHEN days_since_install = 18 THEN users
ELSE 0 END) AS day_18,
SUM(CASE
WHEN days_since_install = 19 THEN users
ELSE 0 END) AS day_19,
SUM(CASE
WHEN days_since_install = 20 THEN users
ELSE 0 END) AS day_20,
SUM(CASE
WHEN days_since_install = 21 THEN users
ELSE 0 END) AS day_21,
SUM(CASE
WHEN days_since_install = 22 THEN users
ELSE 0 END) AS day_22,
SUM(CASE
WHEN days_since_install = 23 THEN users
ELSE 0 END) AS day_23,
SUM(CASE
WHEN days_since_install = 24 THEN users
ELSE 0 END) AS day_24,
SUM(CASE
WHEN days_since_install = 25 THEN users
ELSE 0 END) AS day_25,
SUM(CASE
WHEN days_since_install = 26 THEN users
ELSE 0 END) AS day_26,
SUM(CASE
WHEN days_since_install = 27 THEN users
ELSE 0 END) AS day_27,
SUM(CASE
WHEN days_since_install = 28 THEN users
ELSE 0 END) AS day_28,
SUM(CASE
WHEN days_since_install = 29 THEN users
ELSE 0 END) AS day_29,
SUM(CASE
WHEN days_since_install = 30 THEN users
ELSE 0 END) AS day_30
FROM (
SELECT
DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) AS install_date,
DATE(TIMESTAMP_MICROS(event_timestamp)) AS event_realdate,
DATE_DIFF(DATE(TIMESTAMP_MICROS(event_timestamp)), DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)), day) AS days_since_install,
COUNT(DISTINCT user_pseudo_id) AS users
FROM
`dataset.events_2019*`
WHERE
event_name = 'user_engagement'
AND user_pseudo_id NOT IN (
SELECT
user_pseudo_id
FROM (
SELECT
MIN(global_session_id),
user_pseudo_id,
user_first_touch_timestamp,
event_timestamp
FROM (
SELECT
*,
IF (previous_event='some_event'
AND LAG(global_session_id,1)OVER (ORDER BY global_session_id, event_name)=global_session_id,
LAG(global_session_id,1) OVER (ORDER BY global_session_id, event_name),
NULL) AS match
FROM (
SELECT
*,
LAG(event_name,1) OVER (ORDER BY global_session_id, event_name) AS previous_event
FROM (
SELECT
global_session_id,
event_name,
user_first_touch_timestamp,
event_timestamp,
user_pseudo_id
FROM (
SELECT
global_session_id,
event_name,
user_pseudo_id,
event_timestamp,
user_first_touch_timestamp,
IF (some_kill=1,
global_session_id,
NULL) AS session_some_kill,
IF (event_name='user_engagement',
global_session_id,
NULL) AS session
FROM (
SELECT
*,
CASE
WHEN event_params.key = 'Kills' AND event_params.value.int_value>0 THEN 1
ELSE 0
END AS some_kill,
SUM(is_new_session) OVER (ORDER BY user_pseudo_id, event_timestamp, event_name) AS global_session_id,
SUM(is_new_session) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS user_session_id
FROM (
SELECT
*,
CASE
WHEN event_timestamp - last_event >= (30 * 60 * 1000) OR last_event IS NULL THEN 1
ELSE 0
END AS is_new_session
FROM (
SELECT
user_pseudo_id,
event_timestamp,
event_name,
event_params,
user_first_touch_timestamp,
LAG(event_timestamp,1) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS last_event
FROM (
SELECT
user_pseudo_id,
event_timestamp,
event_name,
event_params,
user_first_touch_timestamp
FROM `dataset.events_2019*`,
UNNEST (event_params) AS event_params)
) last
) agg
)
)
WHERE
session_some_kill IS NOT NULL
OR session IS NOT NULL
GROUP BY
global_session_id,
event_name,
user_first_touch_timestamp,
event_timestamp,
user_pseudo_id
ORDER BY
global_session_id ) ) )
WHERE
match IS NOT NULL
AND event_timestamp-user_first_touch_timestamp<1.8e+9
GROUP BY
user_pseudo_id,
user_first_touch_timestamp,
event_timestamp))
GROUP BY
install_date,
event_realdate,
days_since_install )
GROUP BY
install_date
HAVING
day_0 > 0 /* Remove older dates - not enough data, you should also ignore the first record for partial data */
ORDER BY
install_date
Try the following:
In your UNNEST
clause add a WHERE
to reduce the size of the return records which impact performance, for example:
SELECT
user_pseudo_id,
event_timestamp,
event_name,
event_params,
user_first_touch_timestamp
FROM `analytics_185672896.events_2019*`,
UNNEST (event_params) AS event_params)
event_name = 'user_engagement
Remove ORDER BY
in the inner SQL to avoid extra calculation where not needed since BQ will need to get all results and ORDER before moving to the next step in the execution plan, See this link for more information