Search code examples
elasticsearch, elasticsearch-aggregation, elasticsearch-7

log aggregation in elasticsearch


I am using Elasticsearch 7.8, and my index contains entries like the ones below:

{"_id" : 1,"sourceip":"1.1.1.1", "data" : "this is a sample input", "processedflag" : true}
{"_id" : 2,"sourceip":"1.1.1.1", "data" : "this is a sample input", "processedflag" : false}
{"_id" : 3,"sourceip":"1.1.1.1", "data" : "this is an another input", "processedflag" : false}
{"_id" : 4,"sourceip":"1.1.1.2", "data" : "this is a sample input", "processedflag" : false}

Now, for sourceip 1.1.1.1, I want to run an aggregation that finds documents with duplicate "data" values.
For example, in the case above I want to get the _id values 1 and 2, since those two entries have identical data.

Thanks,
Harry


Solution

  • Looking at your data, I've only considered the first three fields, and based on them I've created the mapping, documents, query and response below.

    Mapping:

    PUT my_ip_index
    {
      "mappings": {
        "properties": {
          "id": {
            "type": "keyword"
          },
          "sourceip":{
            "type": "ip"
          },
          "data":{            
            "type": "keyword"              <----- Notice this though
          }
        }
      }
    }
    

    Sample Documents:

    POST my_ip_index/_doc/1
    {
      "id": 1,
      "sourceip": "1.1.1.1",
      "data": "this is a sample input"
    }
    
    POST my_ip_index/_doc/2
    {
      "id": 2,
      "sourceip": "1.1.1.1",
      "data": "this is a sample input"
    }
    
    POST my_ip_index/_doc/3
    {
      "id": 3,
      "sourceip": "1.1.1.1",
      "data": "this is an another input"
    }
    
    POST my_ip_index/_doc/4
    {
      "id": 4,
      "sourceip": "1.1.1.2",
      "data": "this is a sample input"
    }
    
    POST my_ip_index/_doc/5
    {
      "id": 5,
      "sourceip": "1.1.1.2",
      "data": "this is a sample another input"
    }
    

    Only the first two documents are duplicates, i.e. they have the same IP as well as the same data.

    Aggregation Request:

    POST my_ip_index/_search
    {
      "size": 0,
      "aggs": {
        "my_ip_address": {
          "terms": {
            "field": "sourceip",
            "min_doc_count": 2                          <---- Note this
          },
          "aggs": {
            "my_data": {
              "terms": {
                "field": "data",
                "min_doc_count": 2                      <---- Note this
              },
              "aggs": {
                "my_duplicate_ids":{
                  "terms": {
                    "field": "id",
                    "size": 10
                  }
                }
              }
            },
            "min_bucket_selector": {
              "bucket_selector": {
                "buckets_path": {
                  "count": "my_data._bucket_count" 
                },
                "script": {
                  "source": "params.count > 0"
                }
              }
            }
          }
        }
      }
    }
    

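    Note that I've made use of the terms aggregation (nested three levels deep: on sourceip, then data, then id) and the bucket selector pipeline aggregation; notice in particular how they are structured.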

    Also notice how I've made use of the special _bucket_count path in the bucket selector aggregation, so that IP buckets containing no duplicated data are dropped from the result.

    Response:

    {
      "took" : 0,
      "timed_out" : false,
      "_shards" : {
        "total" : 1,
        "successful" : 1,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : {
          "value" : 5,
          "relation" : "eq"
        },
        "max_score" : null,
        "hits" : [ ]
      },
      "aggregations" : {
        "my_ip_address" : {
          "doc_count_error_upper_bound" : 0,
          "sum_other_doc_count" : 0,
          "buckets" : [
            {
              "key" : "1.1.1.1",                          <---- IP
              "doc_count" : 3,
              "my_data" : {
                "doc_count_error_upper_bound" : 0,
                "sum_other_doc_count" : 0,
                "buckets" : [
                  {
                    "key" : "this is a sample input",     <---- data
                    "doc_count" : 2,
                    "my_duplicate_ids" : {
                      "doc_count_error_upper_bound" : 0,
                      "sum_other_doc_count" : 0,
                      "buckets" : [
                        {
                          "key" : "1",                    <---- id you are looking for
                          "doc_count" : 1
                        },
                        {
                          "key" : "2",                    <---- id you are looking for
                          "doc_count" : 1
                        }
                      ]
                    }
                  }
                ]
              }
            }
          ]
        }
      }
    }
    

    Hope that helps!