elasticsearch elasticsearch-aggregation elasticsearch-dsl

Condition on `terms` aggregation

I would like to put a condition in other word filter data based on aggregated data.

currently, I have a query

GET sense/_search
{
  "size": 0,
  "aggs": {
    "dates": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "1d",
        "format": "yyyy-MM-dd",
        "offset": "+4h"
      },
      "aggs": {
        "unique_sessions": {
          "terms": {
            "field": "sessionId"
          }
        }
      }
    }
  }
}

which returns this kind of data

{
  "aggregations" : {
    "dates" : {
      "buckets" : [
        {
          "key_as_string" : "2019-03-31",
          "key" : 1554004800000,
          "doc_count" : 14,
          "unique_sessions" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "83e1c3a4-341c-4ac3-a81e-f00336ee1dfb",
                "doc_count" : 3
              },
              {
                "key" : "99c4d312-2477-4bf7-ad02-ef76f50443f9",
                "doc_count" : 3
              },
              {
                "key" : "425b840f-9604-4f1d-ab18-96a9a7ae44e0",
                "doc_count" : 1
              },
              {
                "key" : "580b1f6c-6256-4f38-9803-2cc79a0a63d7",
                "doc_count" : 2
              },
              {
                "key" : "8929d75d-153c-4b66-8dd7-2eacb7974b95",
                "doc_count" : 1
              },
              {
                "key" : "8da5d732-d1e7-4a63-8f02-2b84a8bdcb62",
                "doc_count" : 2
              }
            ]
          }
        },
        {
          "key_as_string" : "2019-04-01",
          "key" : 1554091200000,
          "doc_count" : 1,
          "unique_sessions" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "513d4532-304d-44c7-bdc7-398795800383",
                "doc_count" : 1
              },
              {
                "key" : "8da5d732-d1e7-4a63-8f02-2791poc34gq1",
                "doc_count" : 2
              }
            ]
          }
        }
      ]
    }
  }
}

So I would like to retrieve the count of unique sesssionId where doc_count equal to 1.

Which means I expect result where date histogram with key "2019-03-31" will show 2 (because of aggregation with name unique_sessions in buckets has only two sessions with doc_count equal to one) and accordingly "2019-04-01" will show 1 as a result.

Have no clue how to realize this aggregation.

Solution

You would need to make use of Bucket Selector Aggregation on the terms aggregation that you have.

Below is how your query would appear:

Sample Query

POST <your_index_name>/_search
{  
   "size":0,
   "aggs":{  
      "dates":{  
         "date_histogram":{  
            "field":"@timestamp",
            "interval":"1d",
            "format":"yyyy-MM-dd",
            "offset":"+4h"
         },
         "aggs":{  
            "unique_sessions":{  
               "terms":{  
                  "field":"sessionId"
               },
               "aggs":{  
                  "unique_buckets":{  
                     "bucket_selector":{  
                        "buckets_path":{  
                           "count":"_count"
                        },
                        "script":"params.count==1"
                     }
                  }
               }
            }
         }
      }
   }
}

Note that you'd end up with empty buckets in that situation as mentioned in the below response.

Sample Response

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "dates": {
      "buckets": [
        {
          "key_as_string": "2018-12-31",
          "key": 1546228800000,
          "doc_count": 3,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "83e1c3a4-3AFA1c-4ac3-a81e-f00336ee1dfb",
                "doc_count": 1
              }
            ]
          }
        },
        {
          "key_as_string": "2019-01-01",
          "key": 1546315200000,
          "doc_count": 0,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key_as_string": "2019-01-02",
          "key": 1546401600000,
          "doc_count": 3,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": []
          }
        },
        {
          "key_as_string": "2019-01-03",
          "key": 1546488000000,
          "doc_count": 3,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "83e1c3a4-3AFA1c-4ab3-a81e-f00336ee1dfb",
                "doc_count": 1
              }
            ]
          }
        }
      ]
    }
  }
}

In that case, if you would want to filter the buckets to only show the parent buckets which matches the child buckets having count==1 just make use of the below query where I've added another bucket selector clause.

Note carefully the structure of the query.

Refined Query Solution:

POST <your_index_name>/_search
{  
   "size":0,
   "aggs":{  
      "dates":{  
         "date_histogram":{  
            "field":"@timestamp",
            "interval":"1d",
            "format":"yyyy-MM-dd",
            "offset":"+4h"
         },
         "aggs":{  
            "unique_sessions":{  
               "terms":{  
                  "field":"sessionId"
               },
               "aggs":{  
                  "unique_buckets":{  
                     "bucket_selector":{  
                        "buckets_path":{  
                           "count":"_count"
                        },
                        "script":"params.count==1"
                     }
                  }
               }
            },
            "terms_bucket_clause": {
              "bucket_selector": {
                "buckets_path": {
                  "count": "unique_sessions._bucket_count"
                },
                "script": "params.count>0"
              }
            }
         }
      }
   }
}

Refined Query Response

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "dates": {
      "buckets": [
        {
          "key_as_string": "2018-12-31",
          "key": 1546228800000,
          "doc_count": 3,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "83e1c3a4-3AFA1c-4ac3-a81e-f00336ee1dfb",
                "doc_count": 1
              }
            ]
          }
        },
        {
          "key_as_string": "2019-01-03",
          "key": 1546488000000,
          "doc_count": 3,
          "unique_sessions": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": "83e1c3a4-3AFA1c-4ab3-a81e-f00336ee1dfb",
                "doc_count": 1
              }
            ]
          }
        }
      ]
    }
  }
}

Do note the difference in the results in both the query. Hope this helps!