Search code examples
sqlapache-sparkapache-spark-sql

How to get time differences between rows for each ID and Change in Status in SQL


I am trying to get the time differences per id and status change with the following data:

create table Table1(idversion text, id text, status text,  dt datetime);

insert into Table1 values 
("1_1", "1", "OK",'2020-05-14T01:00:00')
,("1_2", "1", "OK",'2020-05-14T01:00:10')
,("1_3", "1", "NOT OK",'2020-05-14T01:00:20')
,("2_1", "2", "OK",'2020-05-14T01:00:00')
,("2_2", "2", "NOT OK", '2020-05-14T01:00:10')
,("1_4", "1", "OK",'2020-05-14T01:00:30')
,("1_5", "1", "OK",'2020-05-14T01:00:40')
,("1_6", "1", "OK",'2020-05-14T01:00:50')
;
Select * FROM Table1 ORDER BY idversion;
idversion id status dt
1_1 1 OK 2020-05-14 01:00:00
1_2 1 OK 2020-05-14 01:00:10
1_3 1 NOT OK 2020-05-14 01:00:20
1_4 1 OK 2020-05-14 01:00:30
1_5 1 OK 2020-05-14 01:00:40
1_6 1 OK 2020-05-14 01:00:50
2_1 2 OK 2020-05-14 01:00:00
2_2 2 NOT OK 2020-05-14 01:00:10

I can get the time_diff between each id as such

SELECT *,
  dt - LAG(dt) OVER ( partition by id order by idversion )as lag
FROM Table1
ORDER BY idversion;

But what I am looking to do is get all the time differences for the same ID and when there is a status change.

So the expected output would be

idversion id status time_diff
"1_1" "1" "OK" 20
"1_3" "1" "NOT OK" 10
"2_1" "2" "OK" 10

Solution

  • Fixing the table first as I do have those fields, I just omitted them

    create table Table1(idversion text, id text, version int, status text,  dt datetime);
    
    insert into Table1 values 
    ("1_1", "1", 1, "OK",'2020-05-14T01:00:00')
    ,("1_2", "1", 2, "OK",'2020-05-14T01:00:10')
    ,("1_3", "1", 3, "NOT OK",'2020-05-14T01:00:20')
    ,("2_1", "2", 1, "OK",'2020-05-14T01:00:00')
    ,("2_2", "2", 2, "NOT OK", '2020-05-14T01:00:10')
    ,("1_4", "1", 4, "OK",'2020-05-14T01:00:30')
    ,("1_5", "1", 5, "OK",'2020-05-14T01:00:40')
    ,("1_6", "1", 6, "OK",'2020-05-14T01:00:50')
    ;
    
    Records: 8  Duplicates: 0  Warnings: 0
    
    Select * FROM Table1 ORDER BY id, version;
    
    idversion id version status dt
    1_1 1 1 OK 2020-05-14 01:00:00
    1_2 1 2 OK 2020-05-14 01:00:10
    1_3 1 3 NOT OK 2020-05-14 01:00:20
    1_4 1 4 OK 2020-05-14 01:00:30
    1_5 1 5 OK 2020-05-14 01:00:40
    1_6 1 6 OK 2020-05-14 01:00:50
    2_1 2 1 OK 2020-05-14 01:00:00
    2_2 2 2 NOT OK 2020-05-14 01:00:10

    And with the following CTEs, I have the desired output, for ID changes and state changes only being shown

    WITH cte  AS (SELECT *,
      COALESCE(lag(status) OVER (partition by id order by id, version), "init") as last_status
    FROM Table1
    ORDER BY id, version
    ),
    cte2 AS (
      SELECT *,
      LEAD(dt) OVER (partition by id order by id, version) - dt as time_diff FROM cte
    WHERE status != last_status
    )
    SELECT * FROM cte2 WHERE time_diff IS NOT NULL;
    
    idversion id version status dt last_status time_diff
    1_1 1 1 OK 2020-05-14 01:00:00 init 20
    1_3 1 3 NOT OK 2020-05-14 01:00:20 OK 10
    2_1 2 1 OK 2020-05-14 01:00:00 init 10

    fiddle