Search code examples
elasticsearchelasticsearch-aggregationelasticsearch-dsl

Elasticsearch filter documents where aggregation bucket DOES NOT EXIST


I have a query which gives me the results I want, but I need to filter further so that only records MISSING a specific bucket are shown.

My query is this:

{
"size": 0,
"query": 
{
    "bool": 
    {
        "must": [{"match_all": {}}],
        "filter": 
        [
            {
                "bool": 
                {
                    "should": 
                    [
                        {"match_phrase": {"user": "bob_user"}},
                        {"match_phrase": {"user": "tom_user"}}
                    ],"minimum_should_match": 1
                }
            },
            {
                "bool": 
                {
                    "should": 
                    [
                        {"match_phrase": {"result_code": "403"}},
                        {"match_phrase": {"result_code": "200"}}
                    ],"minimum_should_match": 1
                }
            },
            {
                "range": {"time": {"gte": "2021-05-12T18:51:22.512Z","lte": "2021-05-13T18:51:22.512Z","format": "strict_date_optional_time"}}}
        ]
    }
},
"aggs": 
{
    "stats": 
    {
        "terms": {"field": "host.keyword","order": {"total_distinct_ip_count": "desc"},"size": 10000},
        "aggs": 
        {
            "total_distinct_ip_count": {"cardinality": {"field": "ip.keyword"}},
            "status_codes": 
            {
                "terms": {"field": "result_code.keyword","order": {"distinct_ip_count_by_status_code": "desc"},"size": 2},
                "aggs": 
                {
                    "distinct_ip_count_by_status_code": {"cardinality": {"field": "ip.keyword"}}
                }
            }
        }
    }
}

}

Which yields the following results:

{
  "key" : "dom.com",
  "doc_count" : 92974,
  "status_codes" : {
    "buckets" : [
      {
        "key" : "200",
        "doc_count" : 92965,
        "distinct_ip_count_by_status_code" : {"value" : 51269}
      },
      {
        "key" : "403",
        "doc_count" : 9,
        "distinct_ip_count_by_status_code" : {"value" : 2}
      }
    ]
  },
  "total_distinct_ip_count" : {"value" : 51269}
},
{
  "key" : "dom2.com",
  "doc_count" : 1420,
  "status_codes" : {
    "buckets" : [
      {
        "key" : "403",
        "doc_count" : 1420,
        "distinct_ip_count_by_status_code" : {"value" : 5}
      }
    ]
  },
  "total_distinct_ip_count" : {"value" : 500}
},
{
  "key" : "dom3.com",
  "doc_count" : 171097,
  "status_codes" : {
    "buckets" : [
      {
        "key" : "200",
        "doc_count" : 127437,
        "distinct_ip_count_by_status_code" : {"value" : 735}
      },
      {
        "key" : "403",
        "doc_count" : 43660,
        "distinct_ip_count_by_status_code" : {"value" : 73}
      }
    ]
  },
  "total_distinct_ip_count" : {"value" : 808}
}

I need a way to return only the record that is missing the 200 bucket. In this case, it would be dom2.com record ONLY based on the fact that it has a 403 bucket, but no 200 bucket. I've messed around with a bucket_selector, but that can only exclude a bucket from the results.. I want to exclude the records that have both 200 and 403 records from the entire results.


Solution

  • {
    "size": 0,
    "query": 
    {
        "bool": 
        {
            "must": [{"match_all": {}}],
            "filter": 
            [
                {
                    "bool": 
                    {
                        "should": 
                        [
                            {"match_phrase": {"user": "bob_user"}},
                            {"match_phrase": {"user": "tom_user"}}
                        ],"minimum_should_match": 1
                    }
                },
                {
                    "bool": 
                    {
                        "should": 
                        [
                            {"match_phrase": {"result_code": "403"}},
                            {"match_phrase": {"result_code": "200"}}
                        ],"minimum_should_match": 1
                    }
                },
                {
                    "range": {"time": {"gte": "2021-05-12T18:51:22.512Z","lte": "2021-05-13T18:51:22.512Z","format": "strict_date_optional_time"}}}
            ]
        }
    },
    "aggs": 
    {
        "stats": 
        {
            "terms": {"field": "host.keyword","order": {"total_distinct_ip_count": "desc"},"size": 10000},
            "aggs": 
            {
                "total_distinct_ip_count": {"cardinality": {"field": "ip.keyword"}},
                "status_codes": 
                {
                    "terms": {"field": "result_code.keyword","order": {"distinct_ip_count_by_status_code": "desc"},"size": 2},
                    "aggs": 
                    {
                        "distinct_ip_count_by_status_code": {"cardinality": {"field": "ip.keyword"}}
                    }
                },
                "only_403":
                {
                    "bucket_selector":
                    {
                        "buckets_path":
                        {"var1": "status_codes['200']>_count"},
                        "script": "params.var1 == null"
                    }
                }
            }
        }
    }