clickhouse

Data block trigger timing of Materialized View


We know that a materialized view is triggered by insertion. A large amount of data from one insertion is divided into multiple blocks, and each block triggers one MV select, according to this doc.

Is the MV select triggered only after all the rows of a block have been received? Or does it depend on the select? Say, if the select counts the rows of the inserted data, is it not triggered until all the data has been received? And if the select is just doing data replication / reverse indexing, is it triggered as soon as the first record of the block is received?


Solution

  • Insert triggers the MatView after the block is inserted into the main table, so the insert just passes a pointer to the block of rows (in memory) to the MatView. The MV select therefore always runs on a complete block, never on partially received rows; an aggregate such as count() is computed per block, as the sketch below and the trace that follows show.
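
    -- A hedged sketch of the per-block consequence for aggregating MVs.
    -- Table and view names here are hypothetical, not from the question.
    -- Each insert block writes its own partial counts into the MV target,
    -- so the reader must re-aggregate at query time.
    create table hits(user_id Int64, url String) engine=MergeTree order by user_id;
    
    create materialized view hits_per_user_mv
    engine=SummingMergeTree order by user_id as
    select user_id, count() as cnt from hits group by user_id;
    
    -- always sum the partial per-block counts when reading:
    select user_id, sum(cnt) as cnt from hits_per_user_mv group by user_id;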

    -- by default clickhouse-client forms blocks of 1048545 rows
    -- clickhouse-client itself parses the input stream and inserts into the
    -- server in Native format
    
    select value from system.settings where name = 'max_insert_block_size';
    1048545
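
    -- (side note, a hedged sketch: block formation is also affected by the
    -- squashing settings below; their defaults vary by ClickHouse version)
    select name, value from system.settings
    where name in ('min_insert_block_size_rows', 'min_insert_block_size_bytes');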
    
    
    -- here is a setup that traces the size of each block, in rows;
    -- the Null engine stores nothing, but attached MVs are still triggered
    create table test(a Int, b String) engine=Null;
    create materialized view test_mv
    engine=MergeTree order by ts as
    select now() ts, count() rows from test;
    
    -- create a file with 100 million rows in TSV format
    clickhouse-client -q "select number, 'x' from numbers(100000000) format TSV" >test.tsv
    
    clickhouse-client -q "insert into test format TSV" <test.tsv
    
    select max(rows), min(rows), count() from test_mv;
    ┌─max(rows)─┬─min(rows)─┬─count()─┐
    │   1947598 │        32 │      65 │
    └───────────┴───────────┴─────────┘
    
    -- 1947598 <- several blocks were squashed into a single one because of
    -- parallel parsing
    -- 65 blocks were passed to the MV
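
    -- (a hedged aside: parallel parsing splits the input stream into chunks
    -- that are parsed concurrently and squashed back into blocks; the chunk
    -- size is governed by this setting, whose default varies by version)
    select value from system.settings where name = 'min_chunk_bytes_for_parallel_parsing';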
    
    truncate test_mv;
    
    
    
    -- let's disable parallel parsing
    clickhouse-client --input_format_parallel_parsing=0 -q "insert into test format TSV" <test.tsv
    
    select max(rows), min(rows), count() from test_mv;
    ┌─max(rows)─┬─min(rows)─┬─count()─┐
    │   1048545 │    388225 │      96 │
    └───────────┴───────────┴─────────┘
    
    -- got 1048545 = max_insert_block_size
    -- 96 blocks were passed
    -- 100000000 - 95 * 1048545 = 388225
    -- (95 blocks of max_insert_block_size rows plus 1 remaining block of 388225 rows)
    
    
    
    truncate test_mv;
    
    
    
    -- let's grow max_insert_block_size
    clickhouse-client --max_insert_block_size=10000000000  --input_format_parallel_parsing=0 -q "insert into test format TSV" <test.tsv
    
    select max(rows), min(rows), count() from test_mv;
    ┌─max(rows)─┬─min(rows)─┬─count()─┐
    │ 100000000 │ 100000000 │       1 │
    └───────────┴───────────┴─────────┘
    
    -- 1 block == 100 million rows
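
    -- (another hedged sketch: for INSERT ... SELECT the server forms the
    -- blocks itself, so sizes follow max_block_size and the
    -- min_insert_block_size_* squashing settings rather than
    -- clickhouse-client's max_insert_block_size; reusing the trace tables:)
    truncate test_mv;
    insert into test select number, 'x' from numbers(100000000);
    select max(rows), min(rows), count() from test_mv;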
    

    More: https://kb.altinity.com/altinity-kb-queries-and-syntax/atomic-insert/