Search code examples
elasticsearch, aggregate, kibana, elasticsearch-aggregation, elasticsearch-dsl

Elasticsearch Bucket count


This is my query, which groups the results by user and by date. I need to extract the following information from it:

  1. How many requests the user submitted to the endpoint. I already have this information; it is 6 in the example below.

  2. Within the date range, on how many days did the user submit at least 1 request to the endpoint? If the user requested the endpoint 50 times on one day, that would still count as only 1 day. I need to achieve this by augmenting the query to return, for each user, the number of daily buckets that contain at least one document.

{
  "query": {
    "bool": {
      "filter": [
        {
          "range": {
            "json.@timestamp": {
              "gt": "2021-08-22T00:00:00.000Z",
              "lt": "2022-10-22T13:41:09.000Z"
            }
          }
        },
        {
          "term": {
            "json.path": "/api/v1/discover"
          }
        },
        {
          "wildcard": {
            "container.image.name": {
              "value": "*prod*"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "group_by_userId": {
      "terms": {
        "field": "json.userId"
      },
      "aggs": {
        "group_by_timestamp": {
          "date_histogram": {
            "field": "@timestamp",
            "interval": "1d"
          }
        }
      }
    }
  }
}

 This results in:

    {
      "aggregations": {
        "group_by_userId": {
          "doc_count_error_upper_bound": 0,
          "sum_other_doc_count": 0,
          "buckets": [
            {
              "key": "1283",
              "doc_count": 6,
              "group_by_timestamp": {
                "buckets": [
                  {
                    "key_as_string": "2022-10-07T00:00:00.000Z",
                    "key": 1665100800000,
                    "doc_count": 4
                  },
                  {
                    "key_as_string": "2022-10-08T00:00:00.000Z",
                    "key": 1665187200000,
                    "doc_count": 0
                  },
                  {
                    "key_as_string": "2022-10-09T00:00:00.000Z",
                    "key": 1665273600000,
                    "doc_count": 0
                  },
                  {
                    "key_as_string": "2022-10-10T00:00:00.000Z",
                    "key": 1665360000000,
                    "doc_count": 2
                  }
                ]
              }
            }
          ]
        }
      }
    }

How can I augment the query so that it also returns the number of non-empty buckets for each user?

For the given example, the bucket count should be 2 (only buckets with a doc_count greater than 0 are counted).

And this is a sample document for recreation purposes:

{
  "_index": "filebeat-7.16.3-2022.10.10",
  "_type": "_doc",
  "_id": "jsWEwoMBBB8VHDQ_esJw",
  "_version": 1,
  "_score": 1,
  "_source": {
    "@timestamp": "2022-10-10T15:30:01.000Z",
    "json": {
      "userId": 4479,
      "@timestamp": "2022-10-10T15:30:01Z",
      "bodySize": 118,
      "caller": "middlewares/logger.go:65",
      "error": "Error #01: user addresses are required.\n",
      "transaction.id": "76312bca3aa68f1b",
      "rawQuery": "",
      "latency": "64.561µs",
      "trace.id": "76312bca3aa68f1b8ec1cdeb141ad6fd",
      "log.level": "warning",
      "path": "/api/v1/discover",
      "method": "GET",
      "message": "",
      "clientIP": "172.31.20.20",
      "status": 400,
      "referrer": ""
    },
    "container": {
      "id": "34965221589",
      "runtime": "docker",
      "image": {
        "name": "amazonaws.com/app:prod-97149bd4-1999999999"
      }
    }
  }
}

Solution

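  • I have used "min_doc_count": 1 in the date histogram to remove the buckets with zero documents, and a stats_bucket pipeline aggregation to count the remaining buckets. In the response, the "count" field of "count_buckets" is the number of days on which the user made at least one request.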

    {
      "query": {
        "bool": {
          "filter": [
            {
              "range": {
                "json.@timestamp": {
                  "gt": "2021-08-22T00:00:00.000Z",
                  "lt": "2022-10-22T13:41:09.000Z"
                }
              }
            },
            {
              "term": {
                "json.path.keyword": "/api/v1/discover"
              }
            },
            {
              "wildcard": {
                "container.image.name": {
                  "value": "*prod*"
                }
              }
            }
          ]
        }
      },
      "aggs": {
        "group_by_userId": {
          "terms": {
            "field": "json.userId"
          },
          "aggs": {
            "group_by_timestamp": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "1d",
                "min_doc_count": 1
              }
            },
            "count_buckets": {
              "stats_bucket": {
                "buckets_path": "group_by_timestamp._count"
              }
            }
          }
        }
      }
    }