postgresql plpgsql timescaledb postgresql-triggers

Retaining and inserting state into a postgres aggregation function

I have a case where I have a table "tab" that stores data (keyed by symbol and time) as well as a second table "summ" that stores the exponential weighted running average of the data in the first table. I have it set up so that when data is added to the first table it triggers the calculation of the running average of the corresponding rows in the second table.

I cant figure out a good way to recover the state however for continuing on the running average. Instead it starts anew each time the trigger is called. How do I save the state of the aggregation at the end of

I have a minimal example at https://dbfiddle.uk/6dqXCwIQ which is also reproduced below.

First making a table that will contain the unsmoothed data as well as a summary table that will hold the smoothed data. There is also a trigger to populate the second table given the first.

/* Making a table that will contain the unsmoothed data */
create table tab (
  symbol text,
  time_to timestamptz,
  something int4,
  PRIMARY KEY(time_to, symbol)
);
CREATE UNIQUE INDEX index_name2 ON tab USING btree (symbol, time_to);

/* Making a summarisation table too and a trigger to populate it when something is inserted into tab. */
CREATE OR REPLACE FUNCTION smoother_state(state double precision[],
                                          newval double precision, frac double precision)
 RETURNS double precision[]
 LANGUAGE plpgsql
 IMMUTABLE PARALLEL SAFE LEAKPROOF
AS $function$
  declare
      resul double precision := case when state[1] is null then newval else state[1] * (1-frac) + newval * frac end;
  begin
    return ARRAY[resul, coalesce(state[2] + 1, 1)];
  END;
 $function$
;
CREATE OR REPLACE AGGREGATE smoother(val double precision, frac double precision) (
    SFUNC = smoother_state,
    STYPE = double precision[2]
);

create table summ (
  symbol text,
  time_to timestamptz,
  smoothed_something double precision,
  number_of_periods double precision,
  PRIMARY KEY(time_to, symbol)
);
CREATE UNIQUE INDEX index_name3 ON summ USING btree (symbol, time_to);

/* Making a trigger.
I guess it should read the smoothed_something and number_of_periods values for the symbol
and use them as an initial state vector.
*/
CREATE OR REPLACE FUNCTION do_update()
 RETURNS trigger
 LANGUAGE plpgsql
 PARALLEL SAFE STRICT LEAKPROOF
AS $function$
  DECLARE
    BEGIN
with a as (select symbol, time_to, something,
  smoother(something, 0.3) over (partition by symbol order by time_to) as smoo
FROM newtab
  ), b as (select symbol, time_to, smoo[1] as smoothed_something, smoo[2] as number_of_periods from a)
INSERT INTO summ (symbol, time_to, smoothed_something, number_of_periods) select * from b;
        RETURN null;
    END;
$function$
;

create trigger update_smoothed after
insert on tab
referencing new table as newtab
for each statement
execute function do_update()

Inserting data the first time both tab and summ look correct.

insert into tab (symbol, time_to, something) values
  ('a', '2022-01-01 00:00:15+01:00'::timestamptz, 15),
  ('b', '2021-01-01 00:00:15+01:00'::timestamptz, 18),
  ('b', '2022-01-01 00:00:15+01:00'::timestamptz, 13),
  ('b', '2023-01-01 00:00:15+01:00'::timestamptz, 11),
  ('b', '2024-01-01 00:00:15+01:00'::timestamptz, 3),
  ('c', '2022-01-01 00:00:16+01:00'::timestamptz, 15),
  ('c', '2022-01-01 00:00:17+01:00'::timestamptz, 150);

Inserting data the second time tab looks correct but summ is not as the aggregation starts from a null state.

insert into tab (symbol, time_to, something) values
  ('a', '2022-06-01 00:00:15+01:00'::timestamptz, 150),
  ('a', '2022-07-01 01:00:15+01:00'::timestamptz, 170),
  ('b', '2024-08-01 00:00:15+01:00'::timestamptz, 180),
  ('b', '2024-09-01 00:00:15+01:00'::timestamptz, 130);

You can see the resultant data in each table below and also in the fiddle site.

For this problem:

Is there a way to modify the trigger to input the correct state when running the aggregation the second time? I am happy to impose that we only ever add data that happens later in time than earlier observations.
Is there a better way to accomplish this? I have alot of data so want to avoid having to do a materialised view (as it takes too long to refresh the whole thing).

Solution

An alternative, closer to the idea from the title: the function can accept an init_state parameter:
_{demo at db<>fiddle}

CREATE OR REPLACE FUNCTION smoother_state
   (state double precision[],
    newval double precision, 
    frac double precision,
    init_state double precision[] default null::float[] )
 RETURNS double precision[]
 LANGUAGE sql
 IMMUTABLE PARALLEL SAFE LEAKPROOF
 RETURN ARRAY[ coalesce(state[1] * (1-frac) + newval * frac, 
                        init_state[1] * (1-frac) + newval * frac, 
                        newval)
              ,coalesce(state[2] + 1, 
                        init_state[2] + 1,
                        1)];
CREATE OR REPLACE AGGREGATE smoother(val double precision, 
                                     frac double precision, 
                                     init_state double precision[]) (
    SFUNC = smoother_state,
    STYPE = double precision[2]
);

You still need to look up the init state and inject it:

CREATE OR REPLACE FUNCTION do_update()
 RETURNS trigger PARALLEL SAFE STRICT LEAKPROOF
AS $function$ BEGIN
WITH a AS (
  SELECT symbol, 
         time_to, 
         smoother(something, 0.3, init_state)OVER w AS smoo
  FROM newtab
  LEFT JOIN(SELECT DISTINCT ON(symbol)
                   symbol,
                   array[smoothed_something,number_of_periods] AS init_state
            FROM summ
            ORDER BY symbol, time_to DESC) AS init_states
  USING(symbol)
  WINDOW w AS(PARTITION BY symbol ORDER BY time_to)
)
INSERT INTO summ (symbol, time_to, smoothed_something, number_of_periods) 
SELECT symbol, 
       time_to, 
       smoo[1] as smoothed_something, 
       smoo[2] as number_of_periods 
FROM a;
RETURN null; END $function$ LANGUAGE plpgsql;

symbol	time_to	smoothed_something	number_of_periods
a	2021-12-31 23:00:15+00	15	1
b	2020-12-31 23:00:15+00	18	1
b	2021-12-31 23:00:15+00	16.5	2
b	2022-12-31 23:00:15+00	14.849999999999998	3
b	2023-12-31 23:00:15+00	11.294999999999998	4
c	2021-12-31 23:00:16+00	15	1
c	2021-12-31 23:00:17+00	55.5	2
a	2022-06-01 00:00:15+01	55.5	2
a	2022-07-01 01:00:15+01	89.85	3
b	2024-08-01 00:00:15+01	61.9065	5
b	2024-09-01 00:00:15+01	82.33455000000001	6