Search code examples
kqlazure-data-explorerkusto-explorer

Percentile at day to day and Percentile Week Level on top of day level Percentile


Below is Percentile at week level or whatever available dates in table

let MySourceTable = datatable(col1:int, col2:int, Value:int, EventUTCDateTime:datetime)
[
1,2,10,'2022-01-01 01:00:01',
1,2,20,'2022-01-01 02:01:01',
1,2,15,'2022-01-02 03:05:01',
1,2,40,'2022-01-03 01:30:01',
1,2,20,'2022-01-03 02:30:01',
1,2,50,'2022-01-04 07:30:01',
1,2,70,'2022-01-05 06:30:01',
1,2,45,'2022-01-05 07:30:01',
1,2,90,'2022-01-05 07:30:01',
1,2,40,'2022-01-06 07:30:01'
];
MySourceTable
| extend EventUTCDate = bin(EventUTCDateTime,1d)
| project col1,col2,Value,EventUTCDate
| summarize  Median=percentile(Value,50)

Below is Percentile at day level

let MySourceTable = datatable(col1:int, col2:int, Value:int, EventUTCDateTime:datetime)
[
1,2,10,'2022-01-01 01:00:01',
1,2,20,'2022-01-01 02:01:01',
1,2,15,'2022-01-02 03:05:01',
1,2,40,'2022-01-03 01:30:01',
1,2,20,'2022-01-03 02:30:01',
1,2,50,'2022-01-04 07:30:01',
1,2,70,'2022-01-05 06:30:01',
1,2,45,'2022-01-05 07:30:01',
1,2,90,'2022-01-05 07:30:01',
1,2,40,'2022-01-06 07:30:01'
];
MySourceTable
| extend EventUTCDate = bin(EventUTCDateTime,1d)
| project col1,col2,Value,EventUTCDate
| summarize Median=percentile(Value,50) by col1,col2,EventUTCDate
| project col1,col2, Median,EventUTCDate

Below is Percentile on top of pre-calculated percentile at day level

let MySourceTable = datatable(col1:int, col2:int, Value:int, EventUTCDateTime:datetime)
[
1,2,10,'2022-01-01 01:00:01',
1,2,20,'2022-01-01 02:01:01',
1,2,15,'2022-01-02 03:05:01',
1,2,40,'2022-01-03 01:30:01',
1,2,20,'2022-01-03 02:30:01',
1,2,50,'2022-01-04 07:30:01',
1,2,70,'2022-01-05 06:30:01',
1,2,45,'2022-01-05 07:30:01',
1,2,90,'2022-01-05 07:30:01',
1,2,40,'2022-01-06 07:30:01'
];
MySourceTable
| extend EventUTCDate = bin(EventUTCDateTime,1d)
| project col1,col2,Value,EventUTCDate
| summarize Median=percentile(Value,50) by col1,col2,EventUTCDate
| project col1,col2, Median,EventUTCDate
| summarize Median02=percentile(Median,50)

Can you please help me how to apply hll() and tdigest(), I have tried as per document but got different result.

My goal is Percentile value in 1st query and last query should be same


Solution

  • tdigest are the intermediate results for computing percentiles. You can compute the tdigest per day, and based on that compute the overall percentile:

    Per day:

    let MySourceTable = datatable(col1:int, col2:int, Value:int, EventUTCDateTime:datetime)
    [
    1,2,10,'2022-01-01 01:00:01',
    1,2,20,'2022-01-01 02:01:01',
    1,2,15,'2022-01-02 03:05:01',
    1,2,40,'2022-01-03 01:30:01',
    1,2,20,'2022-01-03 02:30:01',
    1,2,50,'2022-01-04 07:30:01',
    1,2,70,'2022-01-05 06:30:01',
    1,2,45,'2022-01-05 07:30:01',
    1,2,90,'2022-01-05 07:30:01',
    1,2,40,'2022-01-06 07:30:01'
    ];
    MySourceTable
    | extend EventUTCDate = bin(EventUTCDateTime,1d)
    | project col1,col2,Value,EventUTCDate
    | summarize Median=percentile(Value,50), tdigest = tdigest(Value) by col1,col2,EventUTCDate
    | project col1,col2, Median, MedianFromTdigest = percentile_tdigest(tdigest, 50), EventUTCDate
    
    col1 col2 Median MedianFromTdigest EventUTCDate
    1 2 10 10 2022-01-01 00:00:00.0000000
    1 2 15 15 2022-01-02 00:00:00.0000000
    1 2 20 20 2022-01-03 00:00:00.0000000
    1 2 50 50 2022-01-04 00:00:00.0000000
    1 2 70 70 2022-01-05 00:00:00.0000000
    1 2 40 40 2022-01-06 00:00:00.0000000

    Overall:

    let MySourceTable = datatable(col1:int, col2:int, Value:int, EventUTCDateTime:datetime)
    [
    1,2,10,'2022-01-01 01:00:01',
    1,2,20,'2022-01-01 02:01:01',
    1,2,15,'2022-01-02 03:05:01',
    1,2,40,'2022-01-03 01:30:01',
    1,2,20,'2022-01-03 02:30:01',
    1,2,50,'2022-01-04 07:30:01',
    1,2,70,'2022-01-05 06:30:01',
    1,2,45,'2022-01-05 07:30:01',
    1,2,90,'2022-01-05 07:30:01',
    1,2,40,'2022-01-06 07:30:01'
    ];
    MySourceTable
    | extend EventUTCDate = bin(EventUTCDateTime,1d)
    | project col1,col2,Value,EventUTCDate
    | summarize tdigest = tdigest(Value) by col1,col2,EventUTCDate
    | summarize Median = percentile_tdigest(tdigest_merge(tdigest), 50)
    
    Median
    40