Search code examples
sqlgoogle-bigquerycountercounting

How to get a running count from a string value fields


I have a table that needs to be manipulated to have a rolling count. The issue I'm having is getting it to aggregate continually.

id      |sent|read_at|created_at             |sent_at                |direction|actor_type|
--------+----+-------+-----------------------+-----------------------+---------+----------+
TA-12345|   1|       |2022-12-28 13:45:54.000|2022-12-28 13:45:54.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 14:38:47.000|2022-12-28 14:38:47.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 13:47:01.000|2022-12-28 13:47:01.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 13:18:58.000|2022-12-28 13:18:58.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 14:38:51.000|2022-12-28 14:38:51.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 13:52:40.000|2022-12-28 13:52:40.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 15:06:10.000|2022-12-28 15:06:10.000|OUTBOUND |nurse     |
TA-12345|   1|       |2022-12-28 14:41:43.000|2022-12-28 14:41:43.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:49:11.000|2022-12-28 13:49:11.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:19:35.000|2022-12-28 13:19:35.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 12:58:26.000|2022-12-28 12:58:26.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:19:48.000|2022-12-28 13:19:48.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:49:14.000|2022-12-28 13:49:14.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:51:23.000|2022-12-28 13:51:23.000|INBOUND  |patient   |
TA-12345|   1|       |2022-12-28 13:47:52.000|2022-12-28 13:47:52.000|INBOUND  |patient   |

My code is as:

select 
         id
        , sent      
        , read_at
        , created_at 
        , sent_at
        , direction
        , CASE
                WHEN direction = 'OUTBOUND' THEN 'nurse'
                WHEN direction = 'INBOUND' THEN 'patient'
            END as actor_type
    from `projectid`.warehouse.datasetid
        where id = 'TA-12345'

I did use row_number() over(partition by) but I wasn't sure if this would give me an accurate rolling count if I were to partition by created_at and id

For example:

select 
        id
        , sent      
        , read_at
        , created_at 
        , sent_at
        , direction
        , CASE
                WHEN direction = 'OUTBOUND' THEN 'nurse'
                WHEN direction = 'INBOUND' THEN 'patient'
            END as actor_type
        , row_number() over(PARTITION by direction 
            order by created_at ) as running_count
    from `projectid`.warehouse.datasetid
        where id = 'TA-12345'

That give me this:

id      |sent|read_at|created_at             |sent_at                |direction|actor_type|running_count|
--------+----+-------+-----------------------+-----------------------+---------+----------+-------------+
TA-12345|   1|       |2022-12-28 12:58:26.000|2022-12-28 12:58:26.000|INBOUND  |patient   |            1|
TA-12345|   1|       |2022-12-28 13:19:35.000|2022-12-28 13:19:35.000|INBOUND  |patient   |            2|
TA-12345|   1|       |2022-12-28 13:19:48.000|2022-12-28 13:19:48.000|INBOUND  |patient   |            3|
TA-12345|   1|       |2022-12-28 13:47:52.000|2022-12-28 13:47:52.000|INBOUND  |patient   |            4|
TA-12345|   1|       |2022-12-28 13:49:11.000|2022-12-28 13:49:11.000|INBOUND  |patient   |            5|
TA-12345|   1|       |2022-12-28 13:49:14.000|2022-12-28 13:49:14.000|INBOUND  |patient   |            6|
TA-12345|   1|       |2022-12-28 13:51:23.000|2022-12-28 13:51:23.000|INBOUND  |patient   |            7|
TA-12345|   1|       |2022-12-28 14:41:43.000|2022-12-28 14:41:43.000|INBOUND  |patient   |            8|
TA-12345|   1|       |2022-12-28 13:18:58.000|2022-12-28 13:18:58.000|OUTBOUND |nurse     |            1|
TA-12345|   1|       |2022-12-28 13:45:54.000|2022-12-28 13:45:54.000|OUTBOUND |nurse     |            2|
TA-12345|   1|       |2022-12-28 13:47:01.000|2022-12-28 13:47:01.000|OUTBOUND |nurse     |            3|
TA-12345|   1|       |2022-12-28 13:52:40.000|2022-12-28 13:52:40.000|OUTBOUND |nurse     |            4|
TA-12345|   1|       |2022-12-28 14:38:47.000|2022-12-28 14:38:47.000|OUTBOUND |nurse     |            5|
TA-12345|   1|       |2022-12-28 14:38:51.000|2022-12-28 14:38:51.000|OUTBOUND |nurse     |            6|
TA-12345|   1|       |2022-12-28 15:06:10.000|2022-12-28 15:06:10.000|OUTBOUND |nurse     |            7|

I tried sum(sent) over (order by direction) but that gave me two rows both resulting in one. I also tried SUM(IF(actor_type ="patient", sent, NULL)) AS patient_message_count, But that errored out. How could I get a better way of a running count?

My desired results should look something like this:

id       |sent|read_at|created_at             |sent_at                |direction|actor_type|running_count|
---------+----+-------+-----------------------+-----------------------+---------+----------+-------------+
TA-100000|   1|       |2023-03-20 09:12:41.000|2023-03-20 09:12:41.000|INBOUND  |patient   |            1|
TA-100001|   1|       |2023-03-13 23:45:34.000|2023-03-13 23:45:34.000|INBOUND  |patient   |            1|
TA-100009|   1|       |2023-03-01 10:06:26.000|2023-03-01 10:06:26.000|INBOUND  |patient   |            1|
TA-100011|   1|       |2023-03-06 17:36:21.000|2023-03-06 17:36:21.000|INBOUND  |patient   |            1|
TA-100011|   1|       |2023-03-07 11:36:25.000|2023-03-07 11:36:25.000|INBOUND  |patient   |            2|
TA-100011|   1|       |2023-03-21 12:02:31.000|2023-03-21 12:02:31.000|INBOUND  |patient   |            3|
TA-100014|   1|       |2023-03-17 07:47:11.000|2023-03-17 07:47:11.000|INBOUND  |patient   |            1|
TA-100014|   1|       |2023-03-17 07:47:23.000|2023-03-17 07:47:23.000|INBOUND  |patient   |            2|
TA-100014|   1|       |2023-03-17 07:47:40.000|2023-03-17 07:47:40.000|INBOUND  |patient   |            3|
TA-100014|   1|       |2023-03-17 14:12:46.000|2023-03-17 14:12:46.000|INBOUND  |patient   |            4|
TA-100016|   1|       |2023-03-02 11:10:50.000|2023-03-02 11:10:50.000|INBOUND  |patient   |            1|
TA-100017|   1|       |2023-03-03 12:13:03.000|2023-03-03 12:13:03.000|INBOUND  |patient   |            1|
TA-100019|   1|       |2023-03-31 10:40:19.000|2023-03-31 10:40:19.000|INBOUND  |patient   |            1|
TA-100020|   1|       |2023-03-07 12:32:23.000|2023-03-07 12:32:23.000|INBOUND  |patient   |            1|
TA-100021|   1|       |2023-03-03 11:02:17.000|2023-03-03 11:02:17.000|INBOUND  |patient   |            1|
TA-100024|   1|       |2023-04-03 18:45:19.000|2023-04-03 18:45:19.000|INBOUND  |patient   |            1|
TA-100024|   1|       |2023-04-03 18:56:57.000|2023-04-03 18:56:57.000|INBOUND  |patient   |            2|
TA-100024|   1|       |2023-04-03 18:57:10.000|2023-04-03 18:57:10.000|INBOUND  |patient   |            3|
TA-100024|   1|       |2023-04-04 08:36:56.000|2023-04-04 08:36:56.000|INBOUND  |patient   |            4|
TA-100024|   1|       |2023-04-19 14:57:00.000|2023-04-19 14:57:00.000|INBOUND  |patient   |            5|

When partitioning by id and direction I get:

id       |sent|read_at|created_at             |sent_at                |direction|actor_type|running_count|
---------+----+-------+-----------------------+-----------------------+---------+----------+-------------+
TA-100000|   1|       |2023-04-05 15:00:44.000|2023-04-05 15:00:44.000|OUTBOUND |nurse     |            1|
TA-100000|   1|       |2023-03-20 09:12:41.000|2023-03-20 09:12:41.000|INBOUND  |patient   |            1|
TA-100000|   1|       |2023-03-20 08:50:18.000|2023-03-20 08:50:18.000|OUTBOUND |nurse     |            1|
TA-100000|   1|       |2023-03-20 09:13:42.000|2023-03-20 09:13:42.000|OUTBOUND |nurse     |            1|
TA-100001|   1|       |2023-03-13 23:45:34.000|2023-03-13 23:45:34.000|INBOUND  |patient   |            1|
TA-100001|   1|       |2023-03-14 10:31:31.000|2023-03-14 10:31:31.000|OUTBOUND |nurse     |            1|
TA-100001|   1|       |2023-03-13 13:39:44.000|2023-03-13 13:39:44.000|OUTBOUND |nurse     |            1|
TA-100001|   1|       |2023-03-01 13:11:30.000|2023-03-01 13:11:30.000|OUTBOUND |nurse     |            1|

Expected:

id       |sent|read_at|created_at             |sent_at                |direction|actor_type|running_count|
---------+----+-------+-----------------------+-----------------------+---------+----------+-------------+
TA-100000|   1|       |2023-04-05 15:00:44.000|2023-04-05 15:00:44.000|OUTBOUND |nurse     |            1|
TA-100000|   1|       |2023-03-20 09:12:41.000|2023-03-20 09:12:41.000|INBOUND  |patient   |            1|
TA-100000|   1|       |2023-03-20 08:50:18.000|2023-03-20 08:50:18.000|OUTBOUND |nurse     |            2|
TA-100000|   1|       |2023-03-20 09:13:42.000|2023-03-20 09:13:42.000|OUTBOUND |nurse     |            3|
TA-100001|   1|       |2023-03-13 23:45:34.000|2023-03-13 23:45:34.000|INBOUND  |patient   |            1|
TA-100001|   1|       |2023-03-14 10:31:31.000|2023-03-14 10:31:31.000|OUTBOUND |nurse     |            1|
TA-100001|   1|       |2023-03-13 13:39:44.000|2023-03-13 13:39:44.000|OUTBOUND |nurse     |            2|
TA-100001|   1|       |2023-03-01 13:11:30.000|2023-03-01 13:11:30.000|OUTBOUND |nurse     |            3|

Solution

  • I tried row_number() OVER(PARTITION BY id,direction ORDER BY created_at) AS running_count

        CREATE TABLE datasetid 
        (
            id  VARCHAR(512),
            sent    VARCHAR(512),
            read_at VARCHAR(512),
            created_at  VARCHAR(512),
            sent_at VARCHAR(512),
            direction   VARCHAR(512),
            actor_type  VARCHAR(512)
        );
        
        INSERT INTO datasetid (id, sent, read_at, created_at, sent_at, direction, actor_type) VALUES
            ('TA-100000', '1', '', '2023-04-05 15:00:44.000', '2023-04-05 15:00:44.000', 'OUTBOUND', 'nurse'),
            ('TA-100000', '1', '', '2023-03-20 09:12:41.000', '2023-03-20 09:12:41.000', 'INBOUND', 'patient'),
            ('TA-100000', '1', '', '2023-03-20 08:50:18.000', '2023-03-20 08:50:18.000', 'OUTBOUND', 'nurse'),
            ('TA-100000', '1', '', '2023-03-20 09:13:42.000', '2023-03-20 09:13:42.000', 'OUTBOUND', 'nurse'),
            ('TA-100001', '1', '', '2023-03-13 23:45:34.000', '2023-03-13 23:45:34.000', 'INBOUND', 'patient'),
            ('TA-100001', '1', '', '2023-03-14 10:31:31.000', '2023-03-14 10:31:31.000', 'OUTBOUND', 'nurse'),
            ('TA-100001', '1', '', '2023-03-13 13:39:44.000', '2023-03-13 13:39:44.000', 'OUTBOUND', 'nurse'),
            ('TA-100001', '1', '', '2023-03-01 13:11:30.000', '2023-03-01 13:11:30.000', 'OUTBOUND', 'nurse');
        
        SELECT *,
     row_number() OVER(PARTITION BY id,direction ORDER BY created_at) AS running_count 
        FROM datasetid
    
    id sent read_at created_at sent_at direction actor_type running_count
    TA-100000 1 2023-03-20 09:12:41.000 2023-03-20 09:12:41.000 INBOUND patient 1
    TA-100000 1 2023-03-20 08:50:18.000 2023-03-20 08:50:18.000 OUTBOUND nurse 1
    TA-100000 1 2023-03-20 09:13:42.000 2023-03-20 09:13:42.000 OUTBOUND nurse 2
    TA-100000 1 2023-04-05 15:00:44.000 2023-04-05 15:00:44.000 OUTBOUND nurse 3
    TA-100001 1 2023-03-13 23:45:34.000 2023-03-13 23:45:34.000 INBOUND patient 1
    TA-100001 1 2023-03-01 13:11:30.000 2023-03-01 13:11:30.000 OUTBOUND nurse 1
    TA-100001 1 2023-03-13 13:39:44.000 2023-03-13 13:39:44.000 OUTBOUND nurse 2
    TA-100001 1 2023-03-14 10:31:31.000 2023-03-14 10:31:31.000 OUTBOUND nurse 3