Search code examples
sqlsnowflake-cloud-data-platformoverlapping

Find Duplicate Subscription for overlapping period


I need to identify valid subscription for users for different order placed.

CREATE OR REPLACE TEMP TABLE customer_orders(
    customer_id VARCHAR
  , product_id VARCHAR
  , subscription_start_date DATE
  , subscription_end_date DATE
);

INSERT INTO customer_orders
VALUES
  ('customer_id_001', 'product_id_001', '2024-01-01', '2024-01-31'),
  ('customer_id_001', 'product_id_001', '2024-02-01', '2024-02-29'),
  ('customer_id_001', 'product_id_001', '2024-03-01', '2024-03-31'),
  ('customer_id_001', 'product_id_001', '2024-04-01', '2024-04-15'),
  ('customer_id_001', 'product_id_001', '2024-04-01', '2024-04-30');

SELECT * FROM customer_orders ORDER BY 1,2,3,4,5;

In this data, we have an invalid order order_id_004 as there is a subsequent order order_id_005, which covers the subscription of whole month. I need to flag this record for audit purpose.

CUSTOMER_ID    PRODUCT_ID  SUBSCRIPTION_START_DATE SUBSCRIPTION_END_DATE COMMENTS 
customer_id_001     product_id_001  2024-01-01  2024-01-31 VALID ORDER
customer_id_001     product_id_001  2024-02-01  2024-02-29 VALID ORDER
customer_id_001     product_id_001  2024-03-01  2024-03-31 VALID ORDER
customer_id_001     product_id_001  2024-04-01  2024-04-15 INVALID ORDER
customer_id_001     product_id_001  2024-04-01  2024-04-30 VALID ORDER

Here is work in progress, producing invalid results:

SELECT   
    c1.customer_id 
  , c1.product_id 
  , c1.subscription_start_date 
  , c1.subscription_end_date  
  , CASE WHEN c1.subscription_start_date BETWEEN c2.subscription_start_date AND c2.subscription_end_date
           OR c1.subscription_end_date   BETWEEN c2.subscription_start_date AND c2.subscription_end_date
           OR c2.subscription_start_date BETWEEN c1.subscription_start_date AND c1.subscription_end_date
         THEN 'Duplicate Subscription'
         ELSE 'Valid Subscription'
    END AS comment     
FROM customer_orders c1
INNER JOIN customer_orders c2 
   ON c1.customer_id = c2.customer_id
  AND c1.product_id  = c2.product_id
GROUP BY ALL  
ORDER BY 1,2,3,4,5;

Solution

  • Try something like this:

    WITH src AS (
        SELECT *, ROW_NUMBER() OVER(ORDER BY customer_id, product_id, subscription_start_date, subscription_end_date) AS RN
          FROM customer_orders
    )
    SELECT   
        c1.customer_id 
      , c1.product_id 
      , c1.subscription_start_date 
      , c1.subscription_end_date  
      , IFF(c2.customer_id IS NOT NULL, 'Duplicate Subscription', 'Valid Subscription') AS comment
      , IFF(DATEDIFF(DAY, c2.subscription_start_date, c2.subscription_end_date) >= DATEDIFF(DAY, c1.subscription_start_date, c1.subscription_end_date), 'INVALID ORDER', 'VALID ORDER') AS Status
    FROM src c1
    LEFT JOIN src c2 
       ON c1.customer_id = c2.customer_id
      AND c1.product_id  = c2.product_id
      AND c1.RN <> c2.RN
      AND c1.subscription_start_date <= c2.subscription_end_date
      AND c2.subscription_start_date <= c1.subscription_end_date  
    ORDER BY 1,2,3,4;