I have the following schema in Clickhouse for a web analytics system with tens of millions of records:
CREATE TABLE IF NOT EXISTS analytics.Records
(
-- performance
PerfIsProcessed UInt8, --bool
PerfPageLoadTime UInt16,
PerfDomainLookupTime UInt16,
PerfTCPConnectTime UInt16,
PerfServerResponseTime UInt16,
PerfPageDownloadTime UInt16,
PerfRedirectTime UInt16,
PerfDOMInteractiveTime UInt16,
PerfContentLoadTime UInt16,
PerfResource UInt16,
-- geo:asn
GeoIPAutonomousSystemNumber UInt16,
GeoIPAutonomousSystemOrganization String,
-- requirements
PublicInstanceID String,
Mode UInt8,
Created Datetime
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/analytics/Records', '{replica}')
ORDER BY (Created, Mode, xxHash32(PublicInstanceID))
PARTITION BY toYYYYMM(Created)
SAMPLE BY (xxHash32(PublicInstanceID));
Now, I want to create a materialized view to obtain the following data:
SELECT
toStartOfDay(Created) AS Date,
AVG(PerfPageLoadTime) AS AvgPageLoadTime,
MIN(PerfPageLoadTime) AS MinPageLoadTime,
MAX(PerfPageLoadTime) AS MaxPageLoadTime,
MEDIAN(PerfPageLoadTime) AS MedianPageLoadTime,
STDDEV(PerfPageLoadTime) AS PageLoadTimeStdDev,
AVG(PerfDomainLookupTime) AS AvgDomainLookupTime,
MIN(PerfDomainLookupTime) AS MinDomainLookupTime,
MAX(PerfDomainLookupTime) AS MaxDomainLookupTime,
MEDIAN(PerfDomainLookupTime) AS MedianDomainLookupTime,
STDDEV(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
AVG(PerfTCPConnectTime) AS AvgTCPConnectTime,
MIN(PerfTCPConnectTime) AS MinTCPConnectTime,
MAX(PerfTCPConnectTime) AS MaxTCPConnectTime,
MEDIAN(PerfTCPConnectTime) AS MedianTCPConnectTime,
STDDEV(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
AVG(PerfServerResponseTime) AS AvgServerResponseTime,
MIN(PerfServerResponseTime) AS MinServerResponseTime,
MAX(PerfServerResponseTime) AS MaxServerResponseTime,
MEDIAN(PerfServerResponseTime) AS MedianServerResponseTime,
STDDEV(PerfServerResponseTime) AS ServerResponseTimeStdDev,
GeoIPAutonomousSystemOrganization
FROM
analytics.Records
GROUP BY
Date, GeoIPAutonomousSystemOrganization;
I would like to design the materialized view in a performant way considering the following requirements:
Any suggestions on how to design the materialized view to meet these requirements would be greatly appreciated.
CREATE TABLE IF NOT EXISTS Records
(
-- performance
PerfIsProcessed UInt8, --bool
PerfPageLoadTime UInt16,
PerfDomainLookupTime UInt16,
PerfTCPConnectTime UInt16,
PerfServerResponseTime UInt16,
PerfPageDownloadTime UInt16,
PerfRedirectTime UInt16,
PerfDOMInteractiveTime UInt16,
PerfContentLoadTime UInt16,
PerfResource UInt16,
GeoIPAutonomousSystemNumber UInt16,
GeoIPAutonomousSystemOrganization String,
PublicInstanceID String,
Mode UInt8,
Created Datetime
)
ENGINE = MergeTree
ORDER BY (Created, Mode, xxHash32(PublicInstanceID))
PARTITION BY toYYYYMM(Created);
create table agg (
Date DateTime,
AvgPageLoadTime AggregateFunction(avg, UInt16),
MinPageLoadTime SimpleAggregateFunction(min,UInt16),
MaxPageLoadTime SimpleAggregateFunction(max,UInt16),
MedianPageLoadTime AggregateFunction(quantilesTDigest(0.5), UInt16),
PageLoadTimeStdDev AggregateFunction(stddevSamp, UInt16),
AvgDomainLookupTime AggregateFunction(avg, UInt16),
MinDomainLookupTime SimpleAggregateFunction(min,UInt16),
MaxDomainLookupTime SimpleAggregateFunction(max,UInt16),
MedianDomainLookupTime AggregateFunction(quantilesTDigest(0.5), UInt16),
DomainLookupTimeStdDev AggregateFunction(stddevSamp, UInt16),
AvgTCPConnectTime AggregateFunction(avg, UInt16),
MinTCPConnectTime SimpleAggregateFunction(min,UInt16),
MaxTCPConnectTime SimpleAggregateFunction(max,UInt16),
MedianTCPConnectTime AggregateFunction(quantilesTDigest(0.5), UInt16),
TCPConnectTimeStdDev AggregateFunction(stddevSamp, UInt16),
AvgServerResponseTime AggregateFunction(avg, UInt16),
MinServerResponseTime SimpleAggregateFunction(min,UInt16),
MaxServerResponseTime SimpleAggregateFunction(max,UInt16),
MedianServerResponseTime AggregateFunction(quantilesTDigest(0.5), UInt16),
ServerResponseTimeStdDev AggregateFunction(stddevSamp, UInt16),
GeoIPAutonomousSystemOrganization String
) Engine = AggregatingMergeTree order by (GeoIPAutonomousSystemOrganization, Date)
partition by toYYYYMM(Date);
CREATE materialized view agg_mv to agg as
SELECT
toStartOfDay(Created) AS Date,
avgState(PerfPageLoadTime) AS AvgPageLoadTime,
min(PerfPageLoadTime) AS MinPageLoadTime,
max(PerfPageLoadTime) AS MaxPageLoadTime,
quantilesTDigestState(0.5)(PerfPageLoadTime) AS MedianPageLoadTime,
stddevSampState(PerfPageLoadTime) AS PageLoadTimeStdDev,
avgState(PerfDomainLookupTime) AS AvgDomainLookupTime,
min(PerfDomainLookupTime) AS MinDomainLookupTime,
max(PerfDomainLookupTime) AS MaxDomainLookupTime,
quantilesTDigestState(0.5)(PerfDomainLookupTime) AS MedianDomainLookupTime,
stddevSampState(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
avgState(PerfTCPConnectTime) AS AvgTCPConnectTime,
min(PerfTCPConnectTime) AS MinTCPConnectTime,
max(PerfTCPConnectTime) AS MaxTCPConnectTime,
quantilesTDigestState(0.5)(PerfTCPConnectTime) AS MedianTCPConnectTime,
stddevSampState(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
avgState(PerfServerResponseTime) AS AvgServerResponseTime,
min(PerfServerResponseTime) AS MinServerResponseTime,
max(PerfServerResponseTime) AS MaxServerResponseTime,
quantilesTDigestState(0.5)(PerfServerResponseTime) AS MedianServerResponseTime,
stddevSampState(PerfServerResponseTime) AS ServerResponseTimeStdDev,
GeoIPAutonomousSystemOrganization
FROM
Records
GROUP BY
Date, GeoIPAutonomousSystemOrganization;
set max_partitions_per_insert_block=0;
insert into Records
SELECT * FROM generateRandom('PerfIsProcessed UInt8, PerfPageLoadTime UInt16, PerfDomainLookupTime UInt16, PerfTCPConnectTime UInt16, PerfServerResponseTime UInt16, PerfPageDownloadTime UInt16, PerfRedirectTime UInt16, PerfDOMInteractiveTime UInt16, PerfContentLoadTime UInt16, PerfResource UInt16, GeoIPAutonomousSystemNumber UInt16, GeoIPAutonomousSystemOrganization String, PublicInstanceID String, Mode UInt8,Created Datetime', 1, 10, 2) LIMIT 100000;
SELECT
AVG(PerfPageLoadTime) AS AvgPageLoadTime,
MIN(PerfPageLoadTime) AS MinPageLoadTime,
MAX(PerfPageLoadTime) AS MaxPageLoadTime,
median(PerfPageLoadTime) AS MedianPageLoadTime,
stddevSamp(PerfPageLoadTime) AS PageLoadTimeStdDev,
AVG(PerfDomainLookupTime) AS AvgDomainLookupTime,
MIN(PerfDomainLookupTime) AS MinDomainLookupTime,
MAX(PerfDomainLookupTime) AS MaxDomainLookupTime,
median(PerfDomainLookupTime) AS MedianDomainLookupTime,
stddevSamp(PerfDomainLookupTime) AS DomainLookupTimeStdDev,
AVG(PerfTCPConnectTime) AS AvgTCPConnectTime,
MIN(PerfTCPConnectTime) AS MinTCPConnectTime,
MAX(PerfTCPConnectTime) AS MaxTCPConnectTime,
median(PerfTCPConnectTime) AS MedianTCPConnectTime,
stddevSamp(PerfTCPConnectTime) AS TCPConnectTimeStdDev,
AVG(PerfServerResponseTime) AS AvgServerResponseTime,
MIN(PerfServerResponseTime) AS MinServerResponseTime,
MAX(PerfServerResponseTime) AS MaxServerResponseTime,
median(PerfServerResponseTime) AS MedianServerResponseTime,
stddevSamp(PerfServerResponseTime) AS ServerResponseTimeStdDev
FROM Records \G
──────
AvgPageLoadTime: 32742.03003
MinPageLoadTime: 0
MaxPageLoadTime: 65533
MedianPageLoadTime: 32391.5
PageLoadTimeStdDev: 18896.77446066387
AvgDomainLookupTime: 32769.65665
MinDomainLookupTime: 0
MaxDomainLookupTime: 65535
MedianDomainLookupTime: 33132.5
DomainLookupTimeStdDev: 18901.354485375236
AvgTCPConnectTime: 32767.53588
MinTCPConnectTime: 0
MaxTCPConnectTime: 65534
MedianTCPConnectTime: 32979.5
TCPConnectTimeStdDev: 18934.111527912424
AvgServerResponseTime: 32806.32915
MinServerResponseTime: 0
MaxServerResponseTime: 65535
MedianServerResponseTime: 33434
ServerResponseTimeStdDev: 18907.003897612496
SELECT
avgMerge(AvgPageLoadTime) AS AvgPageLoadTime,
min(MinPageLoadTime) AS MinPageLoadTime,
max(MaxPageLoadTime) AS MaxPageLoadTime,
quantilesTDigestMerge(0.5)(MedianPageLoadTime) AS MedianPageLoadTime,
stddevSampMerge(PageLoadTimeStdDev) AS PageLoadTimeStdDev,
avgMerge(AvgDomainLookupTime) AS AvgDomainLookupTime,
min(MinDomainLookupTime) AS MinDomainLookupTime,
max(MaxDomainLookupTime) AS MaxDomainLookupTime,
quantilesTDigestMerge(0.5)(MedianDomainLookupTime) AS MedianDomainLookupTime,
stddevSampMerge(DomainLookupTimeStdDev) AS DomainLookupTimeStdDev,
avgMerge(AvgTCPConnectTime) AS AvgTCPConnectTime,
min(MinTCPConnectTime) AS MinTCPConnectTime,
max(MaxTCPConnectTime) AS MaxTCPConnectTime,
quantilesTDigestMerge(0.5)(MedianTCPConnectTime) AS MedianTCPConnectTime,
stddevSampMerge(TCPConnectTimeStdDev) AS TCPConnectTimeStdDev,
avgMerge(AvgServerResponseTime) AS AvgServerResponseTime,
min(MinServerResponseTime) AS MinServerResponseTime,
max(MaxServerResponseTime) AS MaxServerResponseTime,
quantilesTDigestMerge(0.5)(MedianServerResponseTime) AS MedianServerResponseTime,
stddevSampMerge(ServerResponseTimeStdDev) AS ServerResponseTimeStdDev
FROM agg \G
AvgPageLoadTime: 32742.03003
MinPageLoadTime: 0
MaxPageLoadTime: 65533
MedianPageLoadTime: [32712.125]
PageLoadTimeStdDev: 18896.77446066387
AvgDomainLookupTime: 32769.65665
MinDomainLookupTime: 0
MaxDomainLookupTime: 65535
MedianDomainLookupTime: [32751.398]
DomainLookupTimeStdDev: 18901.354485375236
AvgTCPConnectTime: 32767.53588
MinTCPConnectTime: 0
MaxTCPConnectTime: 65534
MedianTCPConnectTime: [32821.75]
TCPConnectTimeStdDev: 18934.111527912424
AvgServerResponseTime: 32806.32915
MinServerResponseTime: 0
MaxServerResponseTime: 65535
MedianServerResponseTime: [32799.582]
ServerResponseTimeStdDev: 18907.003897612496