Search code examples
clickhouse

Designing a Performant Materialized View for Web Analytics Data in Clickhouse


I have the following schema in Clickhouse for a web analytics system with tens of millions of records:

CREATE TABLE IF NOT EXISTS analytics.Records
(
  -- performance
  PerfIsProcessed UInt8, --bool
  PerfPageLoadTime UInt16,
  PerfDomainLookupTime UInt16,
  PerfTCPConnectTime UInt16,
  PerfServerResponseTime UInt16,
  PerfPageDownloadTime UInt16,
  PerfRedirectTime UInt16,
  PerfDOMInteractiveTime UInt16,
  PerfContentLoadTime UInt16,
  PerfResource UInt16,
  
  -- geo:asn
  GeoIPAutonomousSystemNumber UInt16,
  GeoIPAutonomousSystemOrganization String,
  
  -- requirements
  PublicInstanceID String,
  Mode UInt8,
  Created Datetime
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/analytics/Records', '{replica}')
ORDER BY (Created, Mode, xxHash32(PublicInstanceID))
PARTITION BY toYYYYMM(Created)
SAMPLE BY (xxHash32(PublicInstanceID));

Now, I want to create a materialized view to obtain the following data:

SELECT
    toStartOfDay(Created) AS Date,
    AVG(PerfPageLoadTime) AS AvgPageLoadTime,
    MIN(PerfPageLoadTime) AS MinPageLoadTime,
    MAX(PerfPageLoadTime) AS MaxPageLoadTime,
    MEDIAN(PerfPageLoadTime) AS MedianPageLoadTime,
    STDDEV(PerfPageLoadTime) AS PageLoadTimeStdDev,
    AVG(PerfDomainLookupTime) AS AvgDomainLookupTime,
    MIN(PerfDomainLookupTime) AS MinDomainLookupTime,
    MAX(PerfDomainLookupTime) AS MaxDomainLookupTime,
    MEDIAN(PerfDomainLookupTime) AS MedianDomainLookupTime,
    STDDEV(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
    AVG(PerfTCPConnectTime) AS AvgTCPConnectTime,
    MIN(PerfTCPConnectTime) AS MinTCPConnectTime,
    MAX(PerfTCPConnectTime) AS MaxTCPConnectTime,
    MEDIAN(PerfTCPConnectTime) AS MedianTCPConnectTime,
    STDDEV(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
    AVG(PerfServerResponseTime) AS AvgServerResponseTime,
    MIN(PerfServerResponseTime) AS MinServerResponseTime,
    MAX(PerfServerResponseTime) AS MaxServerResponseTime,
    MEDIAN(PerfServerResponseTime) AS MedianServerResponseTime,
    STDDEV(PerfServerResponseTime) AS ServerResponseTimeStdDev,
    GeoIPAutonomousSystemOrganization
FROM
    analytics.Records
GROUP BY
    Date, GeoIPAutonomousSystemOrganization;

I would like to design the materialized view in a performant way considering the following requirements:

  • The accuracy of the data is not critical; a 5% error is acceptable.
  • I only need data from the previous day and the days before that; live data is not necessary.
  • Performance is crucial, and I don't want the gathering and calculation of this data to impact the performance of the original table.

Any suggestions on how to design the materialized view to meet these requirements would be greatly appreciated.


Solution

  • CREATE TABLE IF NOT EXISTS Records
    (
      -- performance
      PerfIsProcessed UInt8, --bool
      PerfPageLoadTime UInt16,
      PerfDomainLookupTime UInt16,
      PerfTCPConnectTime UInt16,
      PerfServerResponseTime UInt16,
      PerfPageDownloadTime UInt16,
      PerfRedirectTime UInt16,
      PerfDOMInteractiveTime UInt16,
      PerfContentLoadTime UInt16,
      PerfResource UInt16,
      GeoIPAutonomousSystemNumber UInt16,
      GeoIPAutonomousSystemOrganization String,
      PublicInstanceID String,
      Mode UInt8,
      Created Datetime
    )
    ENGINE = MergeTree
    ORDER BY (Created, Mode, xxHash32(PublicInstanceID))
    PARTITION BY toYYYYMM(Created);
    
    create table agg (
        Date                               DateTime,
        AvgPageLoadTime                    AggregateFunction(avg, UInt16),
        MinPageLoadTime                    SimpleAggregateFunction(min,UInt16),
        MaxPageLoadTime                    SimpleAggregateFunction(max,UInt16),
        MedianPageLoadTime                 AggregateFunction(quantilesTDigest(0.5), UInt16),
        PageLoadTimeStdDev                 AggregateFunction(stddevSamp, UInt16),
        AvgDomainLookupTime                AggregateFunction(avg, UInt16),
        MinDomainLookupTime                SimpleAggregateFunction(min,UInt16),
        MaxDomainLookupTime                SimpleAggregateFunction(max,UInt16),
        MedianDomainLookupTime             AggregateFunction(quantilesTDigest(0.5), UInt16),
        DomainLookupTimeStdDev             AggregateFunction(stddevSamp, UInt16),
        AvgTCPConnectTime                  AggregateFunction(avg, UInt16),
        MinTCPConnectTime                  SimpleAggregateFunction(min,UInt16),
        MaxTCPConnectTime                  SimpleAggregateFunction(max,UInt16),
        MedianTCPConnectTime               AggregateFunction(quantilesTDigest(0.5), UInt16),
        TCPConnectTimeStdDev               AggregateFunction(stddevSamp, UInt16),
        AvgServerResponseTime              AggregateFunction(avg, UInt16),
        MinServerResponseTime              SimpleAggregateFunction(min,UInt16),
        MaxServerResponseTime              SimpleAggregateFunction(max,UInt16),
        MedianServerResponseTime           AggregateFunction(quantilesTDigest(0.5), UInt16),
        ServerResponseTimeStdDev           AggregateFunction(stddevSamp, UInt16),
        GeoIPAutonomousSystemOrganization  String
    ) Engine = AggregatingMergeTree order by (GeoIPAutonomousSystemOrganization, Date)
    partition by toYYYYMM(Date);
    
    CREATE materialized view agg_mv to agg as 
    SELECT
        toStartOfDay(Created) AS Date,
        avgState(PerfPageLoadTime) AS AvgPageLoadTime,
        min(PerfPageLoadTime) AS MinPageLoadTime,
        max(PerfPageLoadTime) AS MaxPageLoadTime,
        quantilesTDigestState(0.5)(PerfPageLoadTime) AS MedianPageLoadTime,
        stddevSampState(PerfPageLoadTime) AS PageLoadTimeStdDev,
        avgState(PerfDomainLookupTime) AS AvgDomainLookupTime,
        min(PerfDomainLookupTime) AS MinDomainLookupTime,
        max(PerfDomainLookupTime) AS MaxDomainLookupTime,
        quantilesTDigestState(0.5)(PerfDomainLookupTime) AS MedianDomainLookupTime,
        stddevSampState(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
        avgState(PerfTCPConnectTime) AS AvgTCPConnectTime,
        min(PerfTCPConnectTime) AS MinTCPConnectTime,
        max(PerfTCPConnectTime) AS MaxTCPConnectTime,
        quantilesTDigestState(0.5)(PerfTCPConnectTime) AS MedianTCPConnectTime,
        stddevSampState(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
        avgState(PerfServerResponseTime) AS AvgServerResponseTime,
        min(PerfServerResponseTime) AS MinServerResponseTime,
        max(PerfServerResponseTime) AS MaxServerResponseTime,
        quantilesTDigestState(0.5)(PerfServerResponseTime) AS MedianServerResponseTime,
        stddevSampState(PerfServerResponseTime) AS ServerResponseTimeStdDev,
        GeoIPAutonomousSystemOrganization
    FROM
        Records
    GROUP BY
        Date, GeoIPAutonomousSystemOrganization;
    
    set max_partitions_per_insert_block=0;
    
    insert into Records 
    SELECT * FROM generateRandom('PerfIsProcessed UInt8, PerfPageLoadTime UInt16, PerfDomainLookupTime UInt16, PerfTCPConnectTime UInt16, PerfServerResponseTime UInt16, PerfPageDownloadTime UInt16, PerfRedirectTime UInt16, PerfDOMInteractiveTime UInt16, PerfContentLoadTime UInt16, PerfResource UInt16, GeoIPAutonomousSystemNumber UInt16, GeoIPAutonomousSystemOrganization String, PublicInstanceID String, Mode UInt8,Created Datetime', 1, 10, 2) LIMIT 100000;
    
    SELECT
        AVG(PerfPageLoadTime) AS AvgPageLoadTime,
        MIN(PerfPageLoadTime) AS MinPageLoadTime,
        MAX(PerfPageLoadTime) AS MaxPageLoadTime,
        median(PerfPageLoadTime) AS MedianPageLoadTime,
        stddevSamp(PerfPageLoadTime) AS PageLoadTimeStdDev,
        AVG(PerfDomainLookupTime) AS AvgDomainLookupTime,
        MIN(PerfDomainLookupTime) AS MinDomainLookupTime,
        MAX(PerfDomainLookupTime) AS MaxDomainLookupTime,
        median(PerfDomainLookupTime) AS MedianDomainLookupTime,
        stddevSamp(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
        AVG(PerfTCPConnectTime) AS AvgTCPConnectTime,
        MIN(PerfTCPConnectTime) AS MinTCPConnectTime,
        MAX(PerfTCPConnectTime) AS MaxTCPConnectTime,
        median(PerfTCPConnectTime) AS MedianTCPConnectTime,
        stddevSamp(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
        AVG(PerfServerResponseTime) AS AvgServerResponseTime,
        MIN(PerfServerResponseTime) AS MinServerResponseTime,
        MAX(PerfServerResponseTime) AS MaxServerResponseTime,
        median(PerfServerResponseTime) AS MedianServerResponseTime,
        stddevSamp(PerfServerResponseTime) AS ServerResponseTimeStdDev
    FROM Records \G
    
    ──────
    AvgPageLoadTime:          32742.03003
    MinPageLoadTime:          0
    MaxPageLoadTime:          65533
    MedianPageLoadTime:       32391.5
    PageLoadTimeStdDev:       18896.77446066387
    AvgDomainLookupTime:      32769.65665
    MinDomainLookupTime:      0
    MaxDomainLookupTime:      65535
    MedianDomainLookupTime:   33132.5
    DomainLookupTimeStdDev:   18901.354485375236
    AvgTCPConnectTime:        32767.53588
    MinTCPConnectTime:        0
    MaxTCPConnectTime:        65534
    MedianTCPConnectTime:     32979.5
    TCPConnectTimeStdDev:     18934.111527912424
    AvgServerResponseTime:    32806.32915
    MinServerResponseTime:    0
    MaxServerResponseTime:    65535
    MedianServerResponseTime: 33434
    ServerResponseTimeStdDev: 18907.003897612496
    
    SELECT
        avgMerge(AvgPageLoadTime) AS AvgPageLoadTime,
        min(MinPageLoadTime) AS MinPageLoadTime,
        max(MaxPageLoadTime) AS MaxPageLoadTime,
        quantilesTDigestMerge(0.5)(MedianPageLoadTime) AS MedianPageLoadTime,
        stddevSampMerge(PageLoadTimeStdDev) AS PageLoadTimeStdDev,
        avgMerge(AvgDomainLookupTime) AS AvgDomainLookupTime,
        min(MinDomainLookupTime) AS MinDomainLookupTime,
        max(MaxDomainLookupTime) AS MaxDomainLookupTime,
        quantilesTDigestMerge(0.5)(MedianDomainLookupTime) AS MedianDomainLookupTime,
        stddevSampMerge(DomainLookupTimeStdDev) AS DomainLookupTimeStdDev,
        avgMerge(AvgTCPConnectTime) AS AvgTCPConnectTime,
        min(MinTCPConnectTime) AS MinTCPConnectTime,
        max(MaxTCPConnectTime) AS MaxTCPConnectTime,
        quantilesTDigestMerge(0.5)(MedianTCPConnectTime) AS MedianTCPConnectTime,
        stddevSampMerge(TCPConnectTimeStdDev) AS TCPConnectTimeStdDev,
        avgMerge(AvgServerResponseTime) AS AvgServerResponseTime,
        min(MinServerResponseTime) AS MinServerResponseTime,
        max(MaxServerResponseTime) AS MaxServerResponseTime,
        quantilesTDigestMerge(0.5)(MedianServerResponseTime) AS MedianServerResponseTime,
        stddevSampMerge(ServerResponseTimeStdDev) AS ServerResponseTimeStdDev
    FROM agg \G
    
    AvgPageLoadTime:          32742.03003
    MinPageLoadTime:          0
    MaxPageLoadTime:          65533
    MedianPageLoadTime:       [32712.125]
    PageLoadTimeStdDev:       18896.77446066387
    AvgDomainLookupTime:      32769.65665
    MinDomainLookupTime:      0
    MaxDomainLookupTime:      65535
    MedianDomainLookupTime:   [32751.398]
    DomainLookupTimeStdDev:   18901.354485375236
    AvgTCPConnectTime:        32767.53588
    MinTCPConnectTime:        0
    MaxTCPConnectTime:        65534
    MedianTCPConnectTime:     [32821.75]
    TCPConnectTimeStdDev:     18934.111527912424
    AvgServerResponseTime:    32806.32915
    MinServerResponseTime:    0
    MaxServerResponseTime:    65535
    MedianServerResponseTime: [32799.582]
    ServerResponseTimeStdDev: 18907.003897612496