Search code examples
sqlamazon-redshiftcorrelated-subquery

Redshift Correlated Subquery Pattern Not Supported


Let me preface by saying that I know this is not a particularly efficient or elegant piece of code. I am querying from a temp table called INSIDE made like so:

CREATE TEMP TABLE INSIDE (CONNECT_DATE DATE, DAILY_CONNECTIONS INT);`

I'm then attempting to run the following query on INSIDE in an attempt to test a model I've been working on.

SELECT *
, q5.DAN_PREDICTION - q5.LINEAR_PREDICTION AS PREDICTION_COMPARISON
, q5.DAN_PREDICTION - q5.ACTUAL_MONTH_END_AMOUNT AS DAN_VARIANCE
, q5.LINEAR_PREDICTION - q5.ACTUAL_MONTH_END_AMOUNT AS LINEAR_VARIANCE
FROM (SELECT *
  , q4.mtd + q4.last_yr_remainder + q4.run_rate * q4.days_remaining AS DAN_PREDICTION
  , q4.mtd + q4.curr_yr_7_day * days_remaining AS LINEAR_PREDICTION
    FROM(
SELECT 
       *
      , q3.curr_yr_7_day - q3.last_yr_7_day AS RUN_RATE
      FROM(
        SELECT 
          CONNECT_DATE
        , DAILY_CONNECTIONS 
        , (cur_yr_1_prev + cur_yr_2_prev + cur_yr_3_prev + cur_yr_4_prev + cur_yr_5_prev + cur_yr_6_prev + cur_yr_7_prev)/7 AS CURR_YR_7_DAY
        , (last_yr_1_prev + last_yr_2_prev + last_yr_3_prev + last_yr_4_prev + last_yr_5_prev + last_yr_6_prev + last_yr_7_prev)/7 AS LAST_YR_7_DAY
        , (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0) 
            FROM INSIDE ins 
            WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE) 
            AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)
            AND ins.CONNECT_DATE <= q2.CONNECT_DATE) AS MTD
        , (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0) 
            FROM INSIDE ins 
            WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE) 
            AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)-1
            AND ins.CONNECT_DATE > DATEADD(YEAR, -1, q2.CONNECT_DATE)) AS LAST_YR_REMAINDER
        , (SELECT TOP 1 DATEPART(DAY, last_day(CONNECT_DATE)) 
            FROM INSIDE 
            WHERE CONNECT_DATE = q2.CONNECT_DATE)-DATEPART(DAY, q2.CONNECT_DATE) DAYS_REMAINING
        , (SELECT ISNULL(SUM(ins.DAILY_CONNECTIONS), 0) 
            FROM INSIDE ins 
            WHERE DATEPART(MONTH, ins.CONNECT_DATE) = DATEPART(MONTH, q2.CONNECT_DATE)
            AND DATEPART(YEAR, ins.CONNECT_DATE) = DATEPART(YEAR, q2.CONNECT_DATE)) AS ACTUAL_MONTH_END_AMOUNT
        FROM
          (SELECT 
            q1.CONNECT_DATE CONNECT_DATE
          , q1.DAILY_CONNECTIONS DAILY_CONNECTIONS
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-1,q1.connect_date)), 0) CUR_YR_1_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-2,q1.connect_date)), 0) CUR_YR_2_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-3,q1.connect_date)), 0) CUR_YR_3_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-4,q1.connect_date)), 0) CUR_YR_4_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-5,q1.connect_date)), 0) CUR_YR_5_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-6,q1.connect_date)), 0) CUR_YR_6_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(DAY,-7,q1.connect_date)), 0) CUR_YR_7_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-1,q1.connect_date)))), 0) LAST_YR_1_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-2,q1.connect_date)))), 0) LAST_YR_2_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-3,q1.connect_date)))), 0) LAST_YR_3_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-4,q1.connect_date)))), 0) LAST_YR_4_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-5,q1.connect_date)))), 0) LAST_YR_5_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-6,q1.connect_date)))), 0) LAST_YR_6_PREV
          , ISNULL((SELECT DAILY_CONNECTIONS FROM INSIDE WHERE CONNECT_DATE = DATEADD(YEAR, -1,(DATEADD(DAY,-7,q1.connect_date)))), 0) LAST_YR_7_PREV
          FROM INSIDE q1 ORDER BY q1.CONNECT_DATE
        ) q2 ORDER BY q2.connect_date
      ) q3  
    ) q4 
  ) q5

Running the inner q1 query seems to work just fine; the problem starts when I run the subqueries in q2. Running any more than one of them (MTD, LAST_YR_REMAINDER, etc) at a time produces the following error:

Amazon Invalid operation: This type of correlated subquery pattern is not supported due to internal error;

I've been reviewing the documentation for unsupported subquery types in Redshift and don't understand which rule these are breaking. Any help would be greatly appreciated.


Solution

  • You have too many inline subqueries. Try using common table expressions (CTE) to decompose the logic in a way that Redshift can run efficiently.

    Most of your inline subqueries can be rewritten as an aggregate over a Cartesian product.

    WITH cte1 AS (
        SELECT i1.CONNECT_DATE       CONNECT_DATE
              ,i1.DAILY_CONNECTIONS  DAILY_CONNECTIONS
               -- Sub-selects converted to an aggregate over a sparse matrix
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -1, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_1_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -2, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_2_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -3, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_3_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -4, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_4_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -5, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_5_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -6, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_6_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(DAY,  -7, i1.connect_date)                    THEN DAILY_CONNECTIONS ELSE NULL END) CUR_YR_7_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-1, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_1_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-2, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_2_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-3, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_3_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-4, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_4_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-5, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_5_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-6, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_6_PREV
              ,MAX(CASE WHEN i2.CONNECT_DATE = DATEADD(YEAR, -1, (DATEADD(DAY,-7, i1.connect_date))) THEN DAILY_CONNECTIONS ELSE NULL END) LAST_YR_7_PREV
              ,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE) 
                         AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE) 
                         AND i2.CONNECT_DATE <= i1.CONNECT_DATE
                        THEN i2.DAILY_CONNECTIONS 
                   ELSE NULL END)   AS MTD
              ,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE) 
                         AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE)-1
                         AND i2.CONNECT_DATE > DATEADD(YEAR, -1, i1.CONNECT_DATE)
                        THEN i2.DAILY_CONNECTIONS 
                   ELSE NULL END)   AS LAST_YR_REMAINDER
              ,MAX(CASE WHEN i2.CONNECT_DATE = i1.CONNECT_DATE-DATEPART(DAY, i1.CONNECT_DATE) 
                        THEN DATEPART(DAY, last_day(CONNECT_DATE)) 
                   ELSE NULL END)   AS DAYS_REMAINING
              ,SUM(CASE WHEN DATEPART(MONTH, i2.CONNECT_DATE) = DATEPART(MONTH, i1.CONNECT_DATE)
                         AND DATEPART(YEAR, i2.CONNECT_DATE) = DATEPART(YEAR, i1.CONNECT_DATE)
                        THEN i2.DAILY_CONNECTIONS
                   ELSE NULL END)   AS ACTUAL_MONTH_END_AMOUNT
        FROM       INSIDE i1
        -- Create an intentional cartesian product
        CROSS JOIN INSIDE i2
        /*  Consider limiting the cartesian to a specific overlap range. E.g.
        WHERE i2.CONNECT_DATE <= DATEADD(YEAR, -1, (DATEADD(DAY,-7, i1.connect_date)))
        */
        -- Use group by to collapse the cartesian back to the original size
        GROUP BY 1, 2
        ORDER BY 1
    ), cte2 AS (
        SELECT CONNECT_DATE
             , DAILY_CONNECTIONS 
             , (cur_yr_1_prev + cur_yr_2_prev + cur_yr_3_prev + cur_yr_4_prev + cur_yr_5_prev + cur_yr_6_prev + cur_yr_7_prev)/7 AS CURR_YR_7_DAY
             , (last_yr_1_prev + last_yr_2_prev + last_yr_3_prev + last_yr_4_prev + last_yr_5_prev + last_yr_6_prev + last_yr_7_prev)/7 AS LAST_YR_7_DAY
             , MTD, LAST_YR_REMAINDER, DAYS_REMAINING, ACTUAL_MONTH_END_AMOUNT
        FROM cte1
        ORDER BY connect_date
    ), cte3 AS (
        SELECT *, curr_yr_7_day - last_yr_7_day AS RUN_RATE
        FROM cte2  
    ), cte4 AS (
        SELECT *
              , mtd + last_yr_remainder + run_rate * days_remaining AS DAN_PREDICTION
              , mtd + curr_yr_7_day * days_remaining AS LINEAR_PREDICTION
        FROM cte3
    )
    SELECT *
         , DAN_PREDICTION - LINEAR_PREDICTION AS PREDICTION_COMPARISON
         , DAN_PREDICTION - ACTUAL_MONTH_END_AMOUNT AS DAN_VARIANCE
         , LINEAR_PREDICTION - ACTUAL_MONTH_END_AMOUNT AS LINEAR_VARIANCE
    FROM cte4