Search code examples
sqlpostgresqlduplicatestemporal-database

Why am I getting duplicate rows with these postgresql temporal db schema & queries?


I'm following some info about setting up a temporal database in postgresql. First the question, then the technical bits.

Question: When I do a single clean insert into the public.countries table, why do I get doubled rows in the temporal.countries table? I only see a single insert (in the countries_ins rule). Is this a feature or a bug?

Ok, now the schema:

DROP SCHEMA IF EXISTS temporal CASCADE;
DROP SCHEMA IF EXISTS history CASCADE;

----------------------------
-- Temporal countries schema
-- [email protected]
--
create schema temporal; -- schema containing all temporal tables
create schema history;  -- schema containing all history tables

-- Current countries data - nothing special
--
create table temporal.countries (
  id   serial primary key,
  name varchar UNIQUE
);

-- Countries historical data.
--
-- Inheritance is used to avoid duplicating the schema from the main table.
-- Please note that columns on the main table cannot be dropped, and other caveats
-- http://www.postgresql.org/docs/9.0/static/ddl-inherit.html#DDL-INHERIT-CAVEATS
--
create table history.countries (

  hid         serial primary key,
  valid_from  timestamp not null,
  valid_to    timestamp not null default '9999-12-31',
  recorded_at timestamp not null default now(),

  constraint from_before_to check (valid_from < valid_to),

  constraint overlapping_times exclude using gist (
    box(
      point( extract( epoch from valid_from), id ),
      point( extract( epoch from valid_to - interval '1 millisecond'), id )
    ) with &&
  )
) inherits ( temporal.countries );

create index timestamps on history.countries using btree ( valid_from, valid_to ) with ( fillfactor = 100 );
create index country_id on history.countries using btree ( id ) with ( fillfactor = 90 );

-- The countries view, what the Rails' application ORM will actually CRUD on, and
-- the core of the temporal updates.
--
-- SELECT - return only current data
--
create view public.countries as select * from only temporal.countries;

-- INSERT - insert data both in the current data table and in the history table
--
create rule countries_ins as on insert to public.countries do instead (
  insert into temporal.countries ( name )
    values ( new.name )
    returning temporal.countries.*;

  insert into history.countries ( id, name, valid_from )
    values ( currval('temporal.countries_id_seq'), new.name, now() )
);

-- UPDATE - set the last history entry validity to now, save the current data in
-- a new history entry and update the current table with the new data.
--
create rule countries_upd as on update to countries do instead (
  update history.countries
    set   valid_to = now()
    where id       = old.id and valid_to = '9999-12-31';

  insert into history.countries ( id, name, valid_from ) 
  values ( old.id, new.name, now() );

  update only temporal.countries
    set name = new.name
    where id = old.id
);

-- DELETE - save the current data in the history and eventually delete the data
-- from the current table.
--
create rule countries_del as on delete to countries do instead (
  update history.countries
    set   valid_to = now()
    where id       = old.id and valid_to = '9999-12-31';

  delete from only temporal.countries
  where temporal.countries.id = old.id
);

-- EOF

And when I load it into a blank db and do a single insert, here's what happens (check out lines 39-40 for the surprising (to me) results).

 1 test=# \i /home/username/temporal.sql
 2 psql:/home/sirrobert/temporal.sql:1: NOTICE:  drop cascades to 3 other objects
 3 DETAIL:  drop cascades to table temporal.countries
 4 drop cascades to view countries
 5 drop cascades to table history.countries
 6 DROP SCHEMA
 7 DROP SCHEMA
 8 CREATE SCHEMA
 9 CREATE SCHEMA
10 CREATE TABLE
11 CREATE TABLE
12 CREATE INDEX
13 CREATE INDEX
14 CREATE VIEW
15 CREATE RULE
16 CREATE RULE
17 CREATE RULE
18 test=# SELECT * FROM public.countries;
19  id | name
20 ----+------
21 (0 rows)
22 
23 test=# SELECT * FROM temporal.countries;
24  id | name
25 ----+------
26 (0 rows)
27 
28 test=# INSERT INTO public.countries (name) VALUES ('USA');
29 INSERT 0 1
30 test=# SELECT * FROM public.countries;
31  id | name
32 ----+------
33   1 | USA
34 (1 row)
35 
36 test=# SELECT * FROM temporal.countries;
37  id | name
38 ----+------
39   1 | USA
40   1 | USA
41 (2 rows)

Solution

  • You insert data into both tables temporal.countries and history.countries with the latter being inherited from the former. That is the wrong approach. You should only insert into history.countries, with the additional attributes. When you then query temporal.countries you see a single record, but without the valid from/to information.

    Once you update a record you will get duplicates. There is no way around that with your current approach. But you don't really need inheritance to begin with. You can have two separate tables and then create a view public.countries that returns currently valid rows from temporal.countries:

    create table temporal.countries (
      id   serial primary key,
      name varchar UNIQUE
    );
    
    create table history.countries (
      hid         serial primary key,
      country     integer not null references temporal.countries,
      name        varchar,
      valid_from  timestamp not null,
      valid_to    timestamp not null default '9999-12-31',
      recorded_at timestamp not null default now(),
    
      constraint from_before_to check (valid_from < valid_to),
    
      constraint overlapping_times exclude using gist (
        box(
          point( extract( epoch from valid_from), id ),
          point( extract( epoch from valid_to - interval '1 millisecond'), id )
        ) with &&
      )
    )  inherits ( temporal.countries );

    Now create the view to return only currently valid countries:

    create view public.countries as
      select c.*
      from temporal.countries c
      join history.countries h on h.country = c.id
      where localtimestamp between h.valid_from and h.valid_to;
    

    And your three rules:

    -- INSERT - insert data in temporal.countries and metadata in history.countries
    create rule countries_ins as on insert to public.countries do instead (
      insert into temporal.countries ( name )
        values ( new.name )
        returning temporal.countries.*;
    
      insert into history.countries ( country, name, valid_from )
        values ( currval('temporal.countries_id_seq'), new.name, now() )
    );
    
    -- UPDATE - set the last history entry validity to now, save the current data in
    -- a new history entry and update the current table with the new data.
    create rule countries_upd as on update to countries do instead (
      update history.countries
        set   valid_to = now()
        where id       = old.id and valid_to = '9999-12-31'; -- view shows only valid data
    
      insert into history.countries ( country, name, valid_from ) 
      values ( old.id, new.name, now() );
    
      update only temporal.countries
        set name = new.name
        where id = old.id
    );
    
    -- DELETE - save the current date in the history and eventually delete the data
    -- from the current table.
    create rule countries_del as on delete to countries do instead (
      update history.countries
        set   valid_to = LOCALTIMESTAMP
        where id       = old.id and valid_to = '9999-12-31';
    
      -- don't delete country data, view won't show it anyway
      delete from only temporal.countries
      where temporal.countries.id = old.id
    );