Search code examples
sqlsql-serverintervalsmathematical-optimization

In a set of overlapping, version-numbered intervals, find the most recent version at each point in time


I'm working with a set of date intervals where each interval has a version number and new intervals will frequently overlap old ones, or even be subsets of them. From this data I need to calculate a new set of intervals that shows the most recent version number, at each point in time. Is there a set-based solution to this problem?

Here's an illustration:

Interval 1: 11111111111111111111111      
Interval 2:     2222222222               
Interval 3:   33333333333333             
Interval 4:                     444444444
Interval 5:                   555555555  
Result    : 11333333333333331155555555544

Here is a sample of the data I'm working with:

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   1/1/2011    1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  12/31/2012  6
1         10/1/2012  11/1/2012   8

... and the desired output:

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   10/1/2010   1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2011  10/1/2012   6
1         10/1/2012  11/1/2012   8 << note how version 8 supersedes version 6
1         11/1/2012  12/31/2012  6 << version 6 is split into two records

I haven't found any other examples of this problem, my googling only turns up queries that identify gaps and islands or covering sets.

I think I have an iterative solution (SQL Server 2008). It starts with a temp table for intervals in the result set and defines the start and end points for the range that we want to cover by inserting records with special version numbers. Then, it repeatedly identifies gaps between result set intervals and attempts to fill them with the most recent records from the original data set, until there are no more gaps or no more records to add:

GO
-- Create data set and results table
CREATE TABLE #Data (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  INT
)

INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2007-12-22', '2008-12-22', 8)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2008-12-22', '2009-12-22', 9)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2009-12-22', '2010-12-22', 10)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2010-12-22', '2011-12-22', 11)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-01-01', '2011-11-30', 500)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-12-22', '2012-12-22', 12)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 13)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 14)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 17)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 19)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-01-01', '2011-01-01', 1)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-10-01', '2011-07-05', 2)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2011-07-05', '2012-08-13', 3)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-08-13', '2012-12-31', 6)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-10-01', '2012-11-01', 8)


CREATE TABLE #Results (
     groupId        VARCHAR(10)
    ,startDate  DATE
    ,endDate    DATE
    ,versionId      BIGINT
)

DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

INSERT #Results
SELECT DISTINCT
     groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
UNION ALL
SELECT DISTINCT
     groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
GO

-- Fill gaps in results table
DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

SET @startDate = '20030101'
SET @endDate = '20111231'
SET @placeholderId = 999999999999999

DECLARE @counter INT
SET @counter = 0

WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        SELECT
             gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            SELECT groupId, endDate as startDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.startDate <= r1.endDate
                        AND r2.endDate > r1.endDate
                )
                AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            SELECT groupId, startDate as endDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.endDate >= r1.startDate
                        AND r2.startDate < r1.startDate
                )
                AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
            AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
         groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
         d.groupId
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        SELECT 
             d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                    AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
            AND NOT EXISTS (
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                    AND dsup.versionId > d.versionId
                    AND dsup.startDate <= d.startDate
                    AND dsup.endDate >= d.endDate
            )
        GROUP BY
             d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId
END

SELECT *
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate

A set-based solution would be much more useful, but I've struggled to find one. Any ideas?


Solution

  • -- create a dates table
    create table dates (thedate date primary key clustered);
    ;with dates(thedate) as (
      select dateadd(yy,years.number,0)+days.number
        from master..spt_values years
        join master..spt_values days
          on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
       where years.type='p' and years.number between 100 and 150
          -- note: 100-150 creates dates in the year range 2000-2050
          --       adjust as required
    )
    insert dbo.dates select * from dates;
    
    -- for each date, determine the prevailing version
      select t.groupId, d.thedate, max(t.versionId) versionId
        into #tmp1
        from dates d
        join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
    group by t.groupId, d.thedate;
    
    -- create index to help
    create clustered index cix_tmp1 on #tmp1(groupId, thedate, versionId);
    
    -- find the start dates
    ;with t as (
       select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
         from #tmp1 a
    left join #tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
        where b.versionId is null
    )
       select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
         from t c
    left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
     order by groupId, startdate;
    

    Of course, you can do everything in "one query" but do it at your peril, as the performance goes down the drain, big time.

    DO NOT USE - for academic interest only-

    ;with dates(thedate) as (
      select dateadd(yy,years.number,0)+days.number
        from master..spt_values years
        join master..spt_values days
          on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
       where years.type='p' and years.number between 100 and 150
          -- note: 100-150 creates dates in the year range 2000-2050
          --       adjust as required
    ), tmp1 as (
      select t.groupId, d.thedate, max(t.versionId) versionId
        from dates d
        join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
    group by t.groupId, d.thedate
    ), t as (
       select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
         from tmp1 a
    left join tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
        where b.versionId is null
    )
       select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
         from t c
    left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
     order by groupId, startdate;