Let me preface by saying that I know this is not a particularly efficient or elegant piece of code. I am querying from a temp table called INSIDE made like so:
CREATE TEMP TABLE INSIDE (CONNECT_DATE DATE, DAILY_CONNECTIONS INT);`
I'm then attempting to run the following query on INSIDE in an attempt to test a model I've been working on.
SELECT *
, q5.DAN_PREDICTION - q5.LINEAR_PREDICTION AS PREDICTION_COMPARISON
, q5.DAN_PREDICTION - q5.ACTUAL_MONTH_END_AMOUNT AS DAN_VARIANCE
, q5.LINEAR_PREDICTION - q5.ACTUAL_MONTH_END_AMOUNT AS LINEAR_VARIANCE
FROM (SELECT *
, q4.mtd + q4.last_yr_remainder + q4.run_rate * q4.days_remaining AS DAN_PREDICTION
, q4.mtd + q4.curr_yr_7_day * days_remaining AS LINEAR_PREDICTION
FROM(
SELECT
*
, q3.curr_yr_7_day - q3.last_yr_7_day AS RUN_RATE
FROM(
SELECT
CONNECT_DATE
, DAILY_CONNECTIONS
, (cur_yr_1_prev + cur_yr_2_prev + cur_yr_3_prev + cur_yr_4_prev + cur_yr_5_prev + cur_yr_6_prev + cur_yr_7_prev)/7 AS CURR_YR_7_DAY
, (last_yr_1_prev + last_yr_2_prev + last_yr_3_prev + last_yr_4_prev + last_yr_5_prev + last_yr_6_prev + last_yr_7_prev)/7 AS LAST_YR_7_DAY
, (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0)
FROM INSIDE ins
WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE)
AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)
AND ins.CONNECT_DATE <= q2.CONNECT_DATE) AS MTD
, (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0)
FROM INSIDE ins
WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE)
AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)-1
AND ins.CONNECT_DATE > DATEADD(YEAR, -1, q2.CONNECT_DATE)) AS LAST_YR_REMAINDER
, (SELECT TOP 1 DATEPART(DAY, last_day(CONNECT_DATE))
FROM INSIDE
WHERE CONNECT_DATE = q2.CONNECT_DATE)-DATEPART(DAY, q2.CONNECT_DATE) DAYS_REMAINING
, (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0)
FROM INSIDE ins
WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE)
AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)) AS ACTUAL_MONTH_END_AMOUNT
FROM
(SELECT
q1.CONNECT_DATE CONNECT_DATE
, q1.DAILY_CONNECTIONS DAILY_CONNECTIONS
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-1,q1.connect_date)), 0) CUR_YR_1_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-2,q1.connect_date)), 0) CUR_YR_2_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-3,q1.connect_date)), 0) CUR_YR_3_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-4,q1.connect_date)), 0) CUR_YR_4_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-5,q1.connect_date)), 0) CUR_YR_5_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-6,q1.connect_date)), 0) CUR_YR_6_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-7,q1.connect_date)), 0) CUR_YR_7_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-1,q1.connect_date)))), 0) LAST_YR_1_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-2,q1.connect_date)))), 0) LAST_YR_2_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-3,q1.connect_date)))), 0) LAST_YR_3_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-4,q1.connect_date)))), 0) LAST_YR_4_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-5,q1.connect_date)))), 0) LAST_YR_5_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-6,q1.connect_date)))), 0) LAST_YR_6_PREV
, ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-7,q1.connect_date)))), 0) LAST_YR_7_PREV
FROM INSIDE q1 ORDER BY q1.CONNECT_DATE
) q2 ORDER BY q2.connect_date
) q3
) q4
) q5
Running the inner q1 query seems to work just fine; the problem starts when I run the subqueries in q2. Running any more than one of them (MTD, LAST_YR_REMAINDER, etc) at a time produces the following error:
Amazon Invalid operation: This type of correlated subquery pattern is not supported due to internal error;
I've been reviewing the documentation for unsupported subquery types in Redshift and don't understand which rule these are breaking. Any help would be greatly appreciated.
You have too many inline subqueries. Try using common table expressions (CTE
) to decompose the logic in a way that Redshift can run efficiently.
Most of your inline subqueries can be rewritten as an aggregate over a Cartesian product.
WITH cte1 AS (
SELECT i1.CONNECT_DATE CONNECT_DATE
,i1.DAILY_CONNECTIONS DAILY_CONNECTIONS
-- Sub-selects converted to an aggregate over a sparse matrix
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -1, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_1_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -2, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_2_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -3, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_3_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -4, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_4_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -5, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_5_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -6, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_6_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY, -7, i1.connect_date) THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_7_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-1, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_1_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-2, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_2_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-3, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_3_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-4, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_4_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-5, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_5_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-6, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_6_PREV
,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-7, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_7_PREV
,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE)
AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE)
AND i2.CONNECT_DATE <= i1.CONNECT_DATE
THEN i2.DAILY_CONNECTIONS
ELSE NULL END) AS MTD
,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE)
AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE)-1
AND i2.CONNECT_DATE > DATEADD(YEAR, -1, i1.CONNECT_DATE)
THEN i2.DAILY_CONNECTIONS
ELSE NULL END) AS LAST_YR_REMAINDER
,MAX(CASE WHEN i2.CONNECT_DATE = i1.CONNECT_DATE-DATEPART(DAY, i1.CONNECT_DATE)
THEN DATEPART(DAY, last_day(CONNECT_DATE))
ELSE NULL END) AS DAYS_REMAINING
,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE)
AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE)
THEN i2.DAILY_CONNECTIONS
ELSE NULL END) AS ACTUAL_MONTH_END_AMOUNT
FROM INSIDE i1
-- Create an intentional cartesian product
CROSS JOIN INSIDE i2
/* Consider limiting the cartesian to a specific overlap range. E.g.
WHERE i2.CONNECT_DATE <= DATEADD(YEAR, -1, (DATEADD(DAY,-7, i1.connect_date)))
*/
-- Use group by to collapse the cartesian back to the original size
GROUP BY 1, 2
ORDER BY 1
), cte2 AS (
SELECT CONNECT_DATE
, DAILY_CONNECTIONS
, (cur_yr_1_prev + cur_yr_2_prev + cur_yr_3_prev + cur_yr_4_prev + cur_yr_5_prev + cur_yr_6_prev + cur_yr_7_prev)/7 AS CURR_YR_7_DAY
, (last_yr_1_prev + last_yr_2_prev + last_yr_3_prev + last_yr_4_prev + last_yr_5_prev + last_yr_6_prev + last_yr_7_prev)/7 AS LAST_YR_7_DAY
, MTD, LAST_YR_REMAINDER, DAYS_REMAINING, ACTUAL_MONTH_END_AMOUNT
FROM cte1
ORDER BY connect_date
), cte3 AS (
SELECT *, curr_yr_7_day - last_yr_7_day AS RUN_RATE
FROM cte2
), cte4 AS (
SELECT *
, mtd + last_yr_remainder + run_rate * days_remaining AS DAN_PREDICTION
, mtd + curr_yr_7_day * days_remaining AS LINEAR_PREDICTION
FROM cte3
)
SELECT *
, DAN_PREDICTION - LINEAR_PREDICTION AS PREDICTION_COMPARISON
, DAN_PREDICTION - ACTUAL_MONTH_END_AMOUNT AS DAN_VARIANCE
, LINEAR_PREDICTION - ACTUAL_MONTH_END_AMOUNT AS LINEAR_VARIANCE
FROM cte4