Search code examples
postgresqlquery-planner

Why does the postgres planner create an all-vs-all join for some queries and not others?


I was writing a function to process selected entries of one table, by creating entries in two other tables. The second inserted table contains a reference to the first inserted table.

The plan was:

  • Select all of the appropriate details from selected rows of the master table
  • Insert entries in the first inserted table using appropriate values and return the new ids
  • Insert entries in the second inserted table using appropriate values, and the corresponding ids from the first inserted table.

This seemed weirdly slow, so I rewrote it as a select - and it still seemed weirdly slow. It seems like, for some WHERE clauses, the postgres planner will calculate an all-against-all join?

Postgres version: PostgreSQL 16.2 on aarch64-unknown-linux-gnu, compiled by aarch64-unknown-linux-gnu-gcc (GCC) 9.5.0, 64-bit

Here's the table definition (though we only use a)

DROP TABLE IF EXISTS c;
DROP TABLE IF EXISTS b;
DROP TABLE IF EXISTS a;
CREATE TABLE a (
  a_id UUID NOT NULL DEFAULT public.gen_random_uuid(), 
  file_id INT NOT NULL,
  ab TEXT NOT NULL,
  ac TEXT NOT NULL,
  name TEXT NOT NULL,
  PRIMARY KEY (a_id)
);
CREATE INDEX "file_id" on a using btree ("file_id");
CREATE TABLE b (
  b_id UUID NOT NULL DEFAULT public.gen_random_uuid(), 
  a_id UUID NOT NULL,
  ab TEXT NOT NULL,
  PRIMARY KEY (b_id),
  CONSTRAINT b_a_id_key FOREIGN KEY (a_id) REFERENCES a(a_id)
);
CREATE TABLE c (
  c_id UUID NOT NULL DEFAULT public.gen_random_uuid(), 
  b_id UUID NOT NULL,
  ac TEXT NOT NULL,
  PRIMARY KEY (c_id),
  CONSTRAINT c_b_id_key FOREIGN KEY (b_id) REFERENCES b(b_id)
);

I filled a thusly with a 12-repeating pattern: you can see the effect with 1000 lines, though it's really noticable with 100,000 (I have a file to attach, if I can attach files?)

insert into a (file_id, ab, ac, name)
VALUES 
(14530, 'ab00000', 'ac00000', 'Huey'), 
(14531, 'ab00001', 'ac00001', 'Dewey'), 
(14532, 'ab00002', 'ac00002', 'Louie'), 
(14530, 'ab00003', 'ac00003', 'Scrooge'), 
(14531, 'ab00004', 'ac00004', 'Huey'), 
(14532, 'ab00005', 'ac00005', 'Dewey'), 
(14530, 'ab00006', 'ac00006', 'Louie'), 
(14531, 'ab00007', 'ac00007', 'Scrooge'), 
(14532, 'ab00008', 'ac00008', 'Huey'), 
(14530, 'ab00009', 'ac00009', 'Dewey'), 
(14531, 'ab00010', 'ac00010', 'Louie'), 
(14532, 'ab00011', 'ac00011', 'Scrooge'), 
(14530, 'ab00012', 'ac00012', 'Huey'), 
(14531, 'ab00013', 'ac00013', 'Dewey'), 
...

Here's the select query:

EXPLAIN (ANALYZE, FORMAT JSON)
WITH upd AS 
(
    SELECT
        a_id, ab, ac
    FROM
        a
    WHERE
        file_id = 14532
    AND 
        SUBSTRING(cast (name AS TEXT),1,1)='H'
    AND 
        SUBSTRING(cast (ac AS TEXT),1,1)='a'
),
inserted_b AS 
(
    SELECT
        a_id, ab
    FROM
        upd
),
inserted_c AS
(
    SELECT
        upd.a_id, ac
    FROM
        upd
    JOIN
        inserted_b
    ON
        inserted_b.a_id = upd.a_id
)
SELECT * FROM inserted_c

And the plan for 100,000 records:

[
  {
    "Plan": {
      "Node Type": "Nested Loop",
      "Parallel Aware": false,
      "Async Capable": false,
      "Join Type": "Inner",
      "Startup Cost": 2052.89,
      "Total Cost": 2052.95,
      "Plan Rows": 1,
      "Plan Width": 48,
      "Actual Startup Time": 1.098,
      "Actual Total Time": 16569.145,
      "Actual Rows": 8333,
      "Actual Loops": 1,
      "Inner Unique": false,
      "Join Filter": "(upd.a_id = upd_1.a_id)",
      "Rows Removed by Join Filter": 69430556,
      "Plans": [
        {
          "Node Type": "Bitmap Heap Scan",
          "Parent Relationship": "InitPlan",
          "Subplan Name": "CTE upd",
          "Parallel Aware": false,
          "Async Capable": false,
          "Relation Name": "a",
          "Alias": "a",
          "Startup Cost": 363.69,
          "Total Cost": 2052.89,
          "Plan Rows": 1,
          "Plan Width": 32,
          "Actual Startup Time": 1.091,
          "Actual Total Time": 11.902,
          "Actual Rows": 8333,
          "Actual Loops": 1,
          "Recheck Cond": "(file_id = 14532)",
          "Rows Removed by Index Recheck": 0,
          "Filter": "((\"substring\"(name, 1, 1) = 'H'::text) AND (\"substring\"(ac, 1, 1) = 'a'::text))",
          "Rows Removed by Filter": 25000,
          "Exact Heap Blocks": 935,
          "Lossy Heap Blocks": 0,
          "Plans": [
            {
              "Node Type": "Bitmap Index Scan",
              "Parent Relationship": "Outer",
              "Parallel Aware": false,
              "Async Capable": false,
              "Index Name": "file_id",
              "Startup Cost": 0,
              "Total Cost": 363.69,
              "Plan Rows": 33520,
              "Plan Width": 0,
              "Actual Startup Time": 0.968,
              "Actual Total Time": 0.968,
              "Actual Rows": 33333,
              "Actual Loops": 1,
              "Index Cond": "(file_id = 14532)"
            }
          ]
        },
        {
          "Node Type": "CTE Scan",
          "Parent Relationship": "Outer",
          "Parallel Aware": false,
          "Async Capable": false,
          "CTE Name": "upd",
          "Alias": "upd",
          "Startup Cost": 0,
          "Total Cost": 0.02,
          "Plan Rows": 1,
          "Plan Width": 48,
          "Actual Startup Time": 1.094,
          "Actual Total Time": 2.415,
          "Actual Rows": 8333,
          "Actual Loops": 1
        },
        {
          "Node Type": "CTE Scan",
          "Parent Relationship": "Inner",
          "Parallel Aware": false,
          "Async Capable": false,
          "CTE Name": "upd",
          "Alias": "upd_1",
          "Startup Cost": 0,
          "Total Cost": 0.02,
          "Plan Rows": 1,
          "Plan Width": 16,
          "Actual Startup Time": 0,
          "Actual Total Time": 1.066,
          "Actual Rows": 8333,
          "Actual Loops": 8333
        }
      ]
    },
    "Planning Time": 0.225,
    "Triggers": [],
    "Execution Time": 16570.526
  }
]

You can see the size of the initial select is 8,333 rows, and the majority of the time is a) building up a 8333x8333 cross join and b) whittling it back down. If I remove the SUBSTRING(cast (ac AS TEXT),1,1)='a' clause, it runs fine, even though it's the same data. If I inline upd:

EXPLAIN (ANALYZE, FORMAT JSON)
WITH 
inserted_b AS 
(
    SELECT
        a_id, ab
    FROM
        a
    WHERE
        file_id = 14532
    AND 
        SUBSTRING(cast (name AS TEXT),1,1)='H'
    AND 
        SUBSTRING(cast (ac AS TEXT),1,1)='a'
),
inserted_c AS
(
    SELECT
        a.a_id, ac
    FROM
        a
    JOIN
        inserted_b
    ON
        inserted_b.a_id = a.a_id
    WHERE
        file_id = 14532
    AND 
        SUBSTRING(cast (name AS TEXT),1,1)='H'
    AND 
        SUBSTRING(cast (ac AS TEXT),1,1)='a'
)
SELECT * FROM inserted_c

It also works fine, back down to 153ms from 17 seconds:

[
  {
    "Plan": {
      "Node Type": "Nested Loop",
      "Parallel Aware": false,
      "Async Capable": false,
      "Join Type": "Inner",
      "Startup Cost": 364.11,
      "Total Cost": 2069.37,
      "Plan Rows": 1,
      "Plan Width": 24,
      "Actual Startup Time": 1.098,
      "Actual Total Time": 49.088,
      "Actual Rows": 8333,
      "Actual Loops": 1,
      "Inner Unique": true,
      "Plans": [
        {
          "Node Type": "Bitmap Heap Scan",
          "Parent Relationship": "Outer",
          "Parallel Aware": false,
          "Async Capable": false,
          "Relation Name": "a",
          "Alias": "a",
          "Startup Cost": 363.69,
          "Total Cost": 2052.89,
          "Plan Rows": 1,
          "Plan Width": 24,
          "Actual Startup Time": 1.086,
          "Actual Total Time": 13.49,
          "Actual Rows": 8333,
          "Actual Loops": 1,
          "Recheck Cond": "(file_id = 14532)",
          "Rows Removed by Index Recheck": 0,
          "Filter": "((\"substring\"(name, 1, 1) = 'H'::text) AND (\"substring\"(ac, 1, 1) = 'a'::text))",
          "Rows Removed by Filter": 25000,
          "Exact Heap Blocks": 935,
          "Lossy Heap Blocks": 0,
          "Plans": [
            {
              "Node Type": "Bitmap Index Scan",
              "Parent Relationship": "Outer",
              "Parallel Aware": false,
              "Async Capable": false,
              "Index Name": "file_id",
              "Startup Cost": 0,
              "Total Cost": 363.69,
              "Plan Rows": 33520,
              "Plan Width": 0,
              "Actual Startup Time": 0.963,
              "Actual Total Time": 0.963,
              "Actual Rows": 33333,
              "Actual Loops": 1,
              "Index Cond": "(file_id = 14532)"
            }
          ]
        },
        {
          "Node Type": "Index Scan",
          "Parent Relationship": "Inner",
          "Parallel Aware": false,
          "Async Capable": false,
          "Scan Direction": "Forward",
          "Index Name": "a_pkey",
          "Relation Name": "a",
          "Alias": "a_1",
          "Startup Cost": 0.42,
          "Total Cost": 8.45,
          "Plan Rows": 1,
          "Plan Width": 16,
          "Actual Startup Time": 0.003,
          "Actual Total Time": 0.003,
          "Actual Rows": 1,
          "Actual Loops": 8333,
          "Index Cond": "(a_id = a.a_id)",
          "Rows Removed by Index Recheck": 0,
          "Filter": "((file_id = 14532) AND (\"substring\"(name, 1, 1) = 'H'::text) AND (\"substring\"(ac, 1, 1) = 'a'::text))",
          "Rows Removed by Filter": 0
        }
      ]
    },
    "Planning Time": 0.902,
    "Triggers": [],
    "Execution Time": 49.503
  }
]

Which seems a little odd - if anything I'd expect it to be slower when it's not reusing the query? Any help greatly appreciated - is it a postgres planner issue, or is there something I can do to avoid the cross join?

(If I switch the SUBSTRING(cast (name AS TEXT),1,1)='H' clause to cast (name AS TEXT)='Huey' then it speeds up again - for a while I thought that text manipulation was the issue, but I can find versions where that's not the problem, it just seems to be an arbitrary "this query is too complex" issue?)

EDIT: for @LaurenzAlbe, the non-JSON versions - the bad query:

"Nested Loop  (cost=2052.89..2052.95 rows=1 width=48) (actual time=1.089..17730.019 rows=8333 loops=1)"
"  Join Filter: (upd.a_id = upd_1.a_id)"
"  Rows Removed by Join Filter: 69430556"
"  Buffers: shared hit=968"
"  CTE upd"
"    ->  Bitmap Heap Scan on a  (cost=363.69..2052.89 rows=1 width=32) (actual time=1.078..12.880 rows=8333 loops=1)"
"          Recheck Cond: (file_id = 14532)"
"          Filter: ((""substring""(name, 1, 1) = 'H'::text) AND (""substring""(ac, 1, 1) = 'a'::text))"
"          Rows Removed by Filter: 25000"
"          Heap Blocks: exact=935"
"          Buffers: shared hit=968"
"          ->  Bitmap Index Scan on file_id  (cost=0.00..363.69 rows=33520 width=0) (actual time=0.953..0.953 rows=33333 loops=1)"
"                Index Cond: (file_id = 14532)"
"                Buffers: shared hit=33"
"  ->  CTE Scan on upd  (cost=0.00..0.02 rows=1 width=48) (actual time=1.081..2.504 rows=8333 loops=1)"
"        Buffers: shared hit=34"
"  ->  CTE Scan on upd upd_1  (cost=0.00..0.02 rows=1 width=16) (actual time=0.001..1.146 rows=8333 loops=8333)"
"        Buffers: shared hit=934"
"Planning:"
"  Buffers: shared hit=5"
"Planning Time: 0.256 ms"
"Execution Time: 17731.739 ms"

And the good:

"Nested Loop  (cost=364.11..2069.37 rows=1 width=24) (actual time=1.079..47.229 rows=8333 loops=1)"
"  Buffers: shared hit=34300"
"  ->  Bitmap Heap Scan on a  (cost=363.69..2052.89 rows=1 width=24) (actual time=1.066..13.455 rows=8333 loops=1)"
"        Recheck Cond: (file_id = 14532)"
"        Filter: ((""substring""(name, 1, 1) = 'H'::text) AND (""substring""(ac, 1, 1) = 'a'::text))"
"        Rows Removed by Filter: 25000"
"        Heap Blocks: exact=935"
"        Buffers: shared hit=968"
"        ->  Bitmap Index Scan on file_id  (cost=0.00..363.69 rows=33520 width=0) (actual time=0.934..0.934 rows=33333 loops=1)"
"              Index Cond: (file_id = 14532)"
"              Buffers: shared hit=33"
"  ->  Index Scan using a_pkey on a a_1  (cost=0.42..8.45 rows=1 width=16) (actual time=0.003..0.003 rows=1 loops=8333)"
"        Index Cond: (a_id = a.a_id)"
"        Filter: ((file_id = 14532) AND (""substring""(name, 1, 1) = 'H'::text) AND (""substring""(ac, 1, 1) = 'a'::text))"
"        Buffers: shared hit=33332"
"Planning:"
"  Buffers: shared hit=42"
"Planning Time: 1.016 ms"
"Execution Time: 47.735 ms"

Solution

  • The problem is probably that the estimates are horrible, because PostgreSQL cannot estimate the substring() conditions properly.

    See if advanced statistics improve the situation:

    CREATE STATISTICS substring_stat ON
       SUBSTRING(cast (name AS TEXT),1,1),
       SUBSTRING(cast (ac AS TEXT),1,1)
       FROM a;
    
    ANALYZE a;