Search code examples
sqldatabasetemp-tablesgaps-and-islandsglobal-temp-tables

Find gap between values of the same type in sql


When there is a gap between commitments of the same type, the fact should contain multiple records that show the accurate start and end date of each continuous commitment. An example of this is patid 1001 and when there is a continuation of the same status without a gap, it should be a single record.


CREATE TABLE #legal_data (
    ClaimID VARCHAR(20)
    ,dim_legalstat_key int -- dimensionkey
    ,[order_start_date] DATE
    ,[order_end_date] DATE
    ,[days_committed]  int -- days between order_start_date & order_end_date
)

INSERT INTO #legal_data
VALUES
    ('1001','11','2022-05-11','2022-10-29','171')
    ,('1001','131','2022-07-15','2023-03-19','247')
    ,('1001','116','2023-03-14','2023-03-20','6')
    ,('1001','11','2023-03-20','2023-03-23','3')
    ,('1207','11','2022-09-13','2023-03-12','180')
    ,('1207','11','2023-03-10','2023-03-23','13')
    ,('1924','2','2021-12-18','2022-06-19','183')
    ,('1924','2','2022-06-19','2023-12-20','184')
    ,('1842','77','2021-02-20','2022-06-17','482')
    ,('1842','77','2022-06-18','2023-12-20','550')
    ,('1661','22','2022-02-14','2023-03-20','399')
    ,('1661','22','2022-02-14','2023-03-23','402')
    ,('1553','4','2022-01-14','2022-02-12','29')
    ,('1553','4','2022-02-14','2023-03-23','402')


----- desired result
CREATE TABLE #legal_Result (
    ClaimID VARCHAR(20)
    ,dim_legalstat_key int-- dimensionkey
    ,[order_start_date] DATE
    ,[order_end_date] DATE
    ,[days_committed]  int --days between order_start_date & order_end_date

)

INSERT INTO #legal_Result
VALUES
    ('1001','11','2022-05-11','2022-10-29','171')
    ,('1001','131','2022-07-15','2023-03-19','247')
    ,('1001','116','2023-03-14','2023-03-20','6')
    ,('1001','11','2023-03-20','2023-03-23','3')
    ,('1207','11','2022-09-13','2023-03-23','191')
    ,('1924','2','2021-12-18','2023-12-20','732')
    ,('1842','77','2021-02-20','2023-12-20','1033') --not working
    ,('1661','22','2022-02-14','2023-03-23','402') ---
    ,('1553','4','2022-01-14','2022-02-12','29') --anything the 
    ,('1553','4','2022-02-14','2023-03-23','402')

select * from #legal_data

select * from #legal_Result

Solution

  • I looked at the solution proposed by @dougp. It works well with "normally" ordered data. I think, problem is with "chaotically" ordered data such as this:

    period1 ---------------------------
    period2              -----
    period3                     ---------------------------
    

    For this question, for example

                ,('2925','5','2022-12-10','2022-12-20','x')
                ,('2925','5','2022-12-15','2022-12-18','x')
                ,('2925','5','2022-12-19','2022-12-29','x')
    

    Query result for this rows

    ClaimID dim_legalstat_key order_start_date order_end_date days_committed
    2925 5 2022-12-10 2022-12-20 10
    2925 5 2022-12-19 2022-12-29 10

    Expected result

    ClaimID dim_legalstat_key order_start_date order_end_date days_committed
    2925 5 2022-12-10 2022-12-29 19

    Maybe the data is ordered "correctly" and there will be no such case.

    This task is interesting. I see, my view of the solution would be similar to @dougp's solution. Seeing a possible error on the test data, I will propose a recursive solution for consideration.

    with ndata as(
    select ClaimID,dim_legalstat_key,order_start_date
      ,max(order_end_date)order_end_date
      ,datediff(d,order_start_date,max(order_end_date))days_committed
      ,row_number()over(partition by ClaimId order by order_start_date) rn 
    from #legal_data
    group by ClaimID,dim_legalstat_key,order_start_date
    )
    ,r as( --Islands head rows
    select rn headrow,rn,1 lvl,ClaimID,dim_legalstat_key
           ,order_start_date,order_end_date,days_committed
           ,cast(rn as varchar(1000)) rowlist
    from ndata t1
    where not exists
        (
         select * from ndata t2 
         where t2.ClaimId=t1.ClaimId and t2.rn<>t1.rn
           and t2.dim_legalstat_key=t1.dim_legalstat_key
           and t2.order_start_date<=t1.order_start_date 
           and t2.order_end_date>=t1.order_start_date 
         )
    
    union all  --iterate through all possible rows
    
    select r.headrow,t2.rn,r.lvl+1 lvl,r.ClaimID,t2.dim_legalstat_key
           ,case when r.order_start_date<t2.order_start_date then r.order_start_date
            else t2.order_start_date end order_start_date
           ,case when r.order_end_date>t2.order_end_date then r.order_end_date
            else t2.order_end_date end order_end_date
           ,r.days_committed
           ,cast(concat(r.rowlist,',',cast(t2.rn as varchar)) as varchar(1000))rowlist
    from r inner join ndata t2 
      on  t2.ClaimId=r.ClaimId  and t2.dim_legalstat_key=r.dim_legalstat_key
        and r.rn<>t2.rn
        and charindex(','+cast(t2.rn as varchar)+',',','+r.rowlist+',')=0
        and t2.order_start_date>=r.order_start_date 
        and t2.order_start_date<=r.order_end_date 
     and lvl<100  
    )
    select ClaimID,dim_legalstat_key
      ,min(order_start_date) order_start_date
      ,max(order_end_date)order_end_date
      ,datediff(d,min(order_start_date),max(order_end_date)) days_committed
    from r
    group by ClaimID,dim_legalstat_key,headrow
    order by ClaimID,dim_legalstat_key
    

    Example

    Update1.

    For concatenated ranges (start_date=next day from end_date =end_date+1day) I'll add some checkings. (for example case with ClaimId 1842)

    First CTE ndata unite rows with the same start_date to 1 row.

    Base part of recursion query r selects first row from group of intersected or concatenated rows.
    Recursive part of query consecutively combines all other rows from this group.

    Corrected query

    with ndata as(
    select ClaimID,dim_legalstat_key,order_start_date
      ,max(order_end_date)order_end_date
      ,datediff(d,order_start_date,max(order_end_date))days_committed
      ,row_number()over(partition by ClaimId order by order_start_date) rn 
    from #legal_data
    group by ClaimID,dim_legalstat_key,order_start_date
    )
    ,r as(
    select rn headrow,rn,1 lvl,ClaimID,dim_legalstat_key
           ,order_start_date,order_end_date,days_committed
           ,cast(rn as varchar(1000)) rowlist
    from ndata t1
    where not exists(
      select * from ndata t2 
      where t2.ClaimId=t1.ClaimId and t2.rn<>t1.rn
        and t2.dim_legalstat_key=t1.dim_legalstat_key
        and 
           ( 
            (  t2.order_start_date<=t1.order_start_date 
                and t2.order_end_date>=t1.order_start_date   
            )
            or
            (   -- concatenated date ranges 
               dateadd(day,1,t2.order_end_date)=t1.order_start_date
            )
           )
      )
    union all
    select r.headrow,t2.rn,r.lvl+1 lvl,r.ClaimID,t2.dim_legalstat_key
           ,case when r.order_start_date<t2.order_start_date then r.order_start_date
            else t2.order_start_date end order_start_date
           ,case when r.order_end_date>t2.order_end_date then r.order_end_date
            else t2.order_end_date end order_end_date
           ,r.days_committed
           ,cast(concat(r.rowlist,',',cast(t2.rn as varchar)) as varchar(1000))rowlist
    from r inner join ndata t2 
      on  t2.ClaimId=r.ClaimId  and t2.dim_legalstat_key=r.dim_legalstat_key
        and r.rn<>t2.rn
        and charindex(','+cast(t2.rn as varchar)+',',','+r.rowlist+',')=0
        and t2.order_start_date>=r.order_start_date
        and t2.order_start_date<=dateadd(day,1,r.order_end_date) -- concatenated date ranges
     and lvl<100  
    )
    --  select * from r;
    select ClaimID,dim_legalstat_key
      ,min(order_start_date) order_start_date
      ,max(order_end_date)order_end_date
      ,datediff(d,min(order_start_date),max(order_end_date)) days_committed
    from r
    group by ClaimID,dim_legalstat_key,headrow
    order by ClaimID,dim_legalstat_key
    

    Example here