Search code examples
sqlsql-servert-sqlgaps-and-islands

Group records by continuous date periods in SQL


I have a table like this:

t104f005_employee_no t104f040_position_no t104f025_date_effective t104f030_date_to
11354 89043 01/07/1999 01/04/2012
11354 89043 02/04/2012 31/08/2014
11354 89043 01/09/2014 21/09/2014
11354 89043 22/09/2014 27/09/2015
11354 23273 04/05/2015 27/09/2015
11354 94040 28/09/2015 10/01/2016
11354 94040 11/01/2016 08/05/2017
11354 94040 09/05/2017 24/03/2019
11354 99406 26/02/2018 01/07/2018
11354 99406 02/07/2018 30/09/2018
11354 99406 01/10/2018 01/11/2018
11354 97293 02/11/2018 25/11/2018
11354 99406 26/11/2018 10/03/2019
11354 97293 11/03/2019 24/03/2019
11354 99406 25/03/2019 30/09/2019
11354 99406 01/10/2019 15/03/2020
11354 97293 11/11/2019 24/11/2019
11354 99406 16/03/2020 10/05/2020
11354 101808 11/05/2020 14/06/2020
11354 99406 11/05/2020 14/06/2020
11354 101808 15/06/2020 04/10/2020
11354 99406 15/06/2020 04/10/2020
11354 101808 05/10/2020 25/10/2020
11354 99406 05/10/2020 21/11/2021
11354 101808 26/10/2020 31/01/2021
11354 101808 01/02/2021 28/02/2021
11354 101808 01/03/2021 30/06/2021
11354 101806 01/07/2021 18/07/2021
11354 101808 19/07/2021 21/11/2021
11354 101808 22/11/2021 28/11/2021
11354 99406 22/11/2021 28/11/2021
11354 101808 29/11/2021 20/03/2022
11354 101806 21/03/2022 17/04/2022
11354 101808 21/03/2022 22/05/2022
11354 101808 23/05/2022 09/04/2023
11354 101903 15/08/2022 12/02/2023
11354 105009 13/02/2023 28/02/2023
11354 105009 01/03/2023 09/04/2023
11354 101808 10/04/2023 NULL
11354 80717 15/05/2023 12/05/2024

And I need to be able to identify the continuous position no periods with the min start and max end date outputted for each continuous period. Expected outcome would look like this:

t104f005_employee_no t104f040_position_no t104f025_date_effective t104f030_date_to
11354 89043 01/07/1999 27/09/2015
11354 23273 04/05/2015 27/09/2015
11354 94040 28/09/2015 24/03/2019
11354 99406 26/02/2018 01/11/2018
11354 97293 02/11/2018 25/11/2018
11354 99406 26/11/2018 10/03/2019
11354 97293 11/03/2019 24/03/2019
11354 99406 25/03/2019 28/11/2021
11354 97293 11/11/2019 24/11/2019
11354 101808 11/05/2020 30/06/2021
11354 101806 01/07/2021 18/07/2021
11354 101808 19/07/2021 NULL
11354 101806 21/03/2022 17/04/2022
11354 101903 15/08/2022 12/02/2023
11354 105009 13/02/2023 28/02/2023
11354 105009 01/03/2023 09/04/2023
11354 80717 15/05/2023 12/05/2024

I have tried the following methods but it's not working out and I'm a bit stumped now.

WITH RankedPositions AS (
  SELECT
    [t104f005_employee_no],
    [t104f040_position_no],
    [t104f025_date_effective],
    [t104f030_date_to],
    ROW_NUMBER() OVER (PARTITION BY [t104f005_employee_no] ORDER BY [t104f025_date_effective]) -
    ROW_NUMBER() OVER (PARTITION BY [t104f005_employee_no], [t104f040_position_no] ORDER BY [t104f025_date_effective]) AS grp
  FROM [AUR11PROD].[dbo].[t104_employment_history] with (nolock)
  WHERE [t104f005_employee_no] = '11354'
)    
SELECT
  [t104f005_employee_no],
  [t104f040_position_no],
  MIN([t104f025_date_effective]) AS min_startdate,
  MAX([t104f030_date_to]) AS max_enddate
FROM RankedPositions
GROUP BY [t104f005_employee_no], [t104f040_position_no], grp
ORDER BY [t104f005_employee_no], min_startdate;

and this one

WITH RECURSIVE ContinuousPositions AS (
  SELECT
    employee_no,
    position_no,
    startdate,
    enddate
  FROM your_table_name
  WHERE NOT EXISTS (
    SELECT 1
    FROM your_table_name AS t2
    WHERE t2.employee_no = your_table_name.employee_no
      AND t2.position_no = your_table_name.position_no
      AND t2.startdate < your_table_name.startdate
  )
  
  UNION ALL
  
  SELECT
    cp.employee_no,
    cp.position_no,
    cp.startdate,
    t.enddate
  FROM ContinuousPositions AS cp
  JOIN your_table_name AS t ON (
    cp.employee_no = t.employee_no
    AND cp.position_no = t.position_no
    AND cp.enddate = DATEADD(day, -1, t.startdate)
  )
)    
SELECT
  employee_no,
  position_no,
  MIN(startdate) AS min_startdate,
  MAX(enddate) AS max_enddate
FROM ContinuousPositions
GROUP BY employee_no, position_no
ORDER BY employee_no, min_startdate;

Solution

  • This is a gaps and islands problem. One way is to use LEAD() or LAG() to identify when the continuous date break and set a flag (g). Performing a cumulative sum() on the flag gives you the required grouping (grp).

    with 
    cte as
    (
        select employee_no, position_no, date_effective, date_expiry,
               g = case when dateadd(day, -1, date_effective)
                        <>   lag(date_expiry) over (partition by employee_no, position_no
                                                        order by date_effective)
                        then 1
                        else 0
                        end
        from   employment_history
    ),
    cte2 as
    (
       select employee_no, position_no, date_effective, 
              date_expiry = isnull(date_expiry, '99991231'), 
              grp = sum(g) over (partition by employee_no, position_no
                                     order by date_effective)  
        from  cte
    )
    select employee_no, position_no, 
           date_effective = min(date_effective), 
           date_expiry    = nullif(max(date_expiry), '99991231')
    from   cte2
    group by employee_no, position_no, grp
    order by employee_no, date_effective
    

    db<>fiddle demo