Search code examples
elasticsearch, elasticsearch-aggregation, elasticsearch-query

Fetch all-time date_histogram bucket results


I have the below query to fetch aggregations using Elasticsearch 7.1.

{ 
  "query": { 
    "bool": { 
      "filter": [ 
        { 
          "bool": { 
            "must": [ 
              { 
                "match": { 
                  "viewedInFeed": true
                } 
              }
            ] 
          } 
        } 
      ] 
    } 
  },
  "size": 0, 
  "aggs": { 
    "viewed_in_feed_by_day": { 
      "date_histogram": { 
        "field": "createdDate", 
        "interval" : "day",
        "format" : "yyyy-MM-dd",
        "min_doc_count": 1
      } 
    } 
  } 
}

The query matches more than 10,000 documents, and I am not sure how to proceed, since scroll is not available for aggregations. See the response below.

{
    "took": 3,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 10000,
            "relation": "gte"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "viewed_in_feed_by_day": {
            "buckets": [
                {
                    "key_as_string": "2020-03-19",
                    "key": 1584576000000,
                    "doc_count": 3028
                },
                {
                    "key_as_string": "2020-03-20",
                    "key": 1584662400000,
                    "doc_count": 5384
                },
                {
                    "key_as_string": "2020-03-21",
                    "key": 1584748800000,
                    "doc_count": 3521
                }
            ]
        }
    }
}

When using the _count API, the document count is greater than 10,000. Even without "min_doc_count": 1, the query doesn't return any additional results, although I know there is more data.


Solution

  • Building on top of Jaspreet's comments I suggest the following:

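    • Use track_total_hits=true to get the exact hit count (since 7.0) while keeping size=0 so that only the aggregations are returned. By default the reported total is capped at 10,000 (hence "relation": "gte"); this cap applies only to the hit count, not to the documents the aggregations run over.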
    • Use the stats aggregation on the date field to see its document count and min/max dates before running your histograms.
    GET dates/_search
    { 
      "track_total_hits": true,               
      "size": 0, 
      "aggs": { 
        "dates_insights": {
          "stats": {
            "field": "createdDate"
          }
        },
        "viewed_in_feed_by_day": { 
          "date_histogram": { 
            "field": "createdDate", 
            "interval" : "month",
            "format" : "yyyy-MM-dd",
            "min_doc_count": 1
          } 
        } 
      } 
    }
    

    yielding

    ...
    "hits" : {
        "total" : {
          "value" : 3,
          "relation" : "eq"
        },
        "max_score" : null,
        "hits" : [ ]
      },
      "aggregations" : {
        "viewed_in_feed_by_day" : {
          "buckets" : [
            {
              "key_as_string" : "2020-01-01",
              "key" : 1577836800000,
              "doc_count" : 1
            },
            {
              "key_as_string" : "2020-02-01",
              "key" : 1580515200000,
              "doc_count" : 1
            },
            {
              "key_as_string" : "2020-03-01",
              "key" : 1583020800000,
              "doc_count" : 1
            }
          ]
        },
        "dates_insights" : {
          "count" : 3,
          ...
          "min_as_string" : "2020-01-22T13:09:21.588Z",
          "max_as_string" : "2020-03-22T13:09:21.588Z",
          ...
        }
      }
    ...