Search code examples
elasticsearchaggregationcardinalitydate-histogram

Pipeline aggregation with Date histogram doesn’t return expected result


I'm facing an issue regarding to use Pipeline aggregation with Date histogram. I need to filter data from: "2019-03-08T06:00:00Z" to "2019-03-09T10:00:00Z" and do histogram aggregation on that. Then calculate avg value after aggregating by cardinality agg.

{
  "size": 0,
  "query": {
        "bool" : {
            "filter": {
                "range" : {
                    "recordTime" : {
                        "gte" : "2019-03-08T06:00:00Z",
                        "lte" : "2019-03-09T10:00:00Z"
                    }
                }
            }
        }
    }, 
    "aggs" : {
        "events_per_bucket" : {
            "date_histogram" : {
                "field" : "eventTime",
                "interval" : "1h"
            },
            "aggs": {
                "cards_per_bucket": {
                    "cardinality": {
                        "field": "KANBAN_PKKEY.keyword"
                    }
                }
            }
        },
        "avg_cards_per_bucket": {
            "avg_bucket": {
                "buckets_path": "events_per_bucket>cards_per_bucket.value"
            }
        }
    }
}

Result:

{
    "took": 4,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 0,
        "hits": []
    },
    "aggregations": {
        "events_per_bucket": {
            "buckets": [
                {
                    "key_as_string": "2019-03-08T06:00:00.000Z",
                    "key": 1552024800000,
                    "doc_count": 1,
                    "cards_per_bucket": {
                        **"value": 1**
                    }
                },
                {
                    "key_as_string": "2019-03-08T07:00:00.000Z",
                    "key": 1552028400000,
                    "doc_count": 0,
                    "cards_per_bucket": {
                        **"value": 0**
                    }
                },
                {
                    "key_as_string": "2019-03-08T08:00:00.000Z",
                    "key": 1552032000000,
                    "doc_count": 1,
                    "cards_per_bucket": {
                        **"value": 1**
                    }
                }
            ]
        },
        "avg_cards_per_bucket": {
            **"value": 1**
        }
    }
}

The problem is why avg value is "1"? It should be: 2/3 = 0.6666 Why 0 value cardinality bucket is ignored? If i remove cardinality agg and do avg on doc_count (events_per_bucket>_count) then it works fine. The same thing happens for MAX, MIN, SUM as well. Any help would be appreciated! Thank you.


Solution

  • you should tell the aggregation pipeline what to do in the case of gaps in your buckets, like your bucket with key 1552028400000. By default, gaps are ignored. You might want instead to replace the missing values with a zero. This can be done by adding the gap_policy parameter to your aggregation pipeline:

    ...
      "avg_cards_per_bucket": {
        "avg_bucket": {
          "buckets_path": "events_per_bucket>cards_per_bucket.value",
          "gap_policy": "insert_zeros"
        }
      }
    ...
    

    More details in the Elastic documentation.