Search code examples
elasticsearchkibanaelastic-stackelasticsearch-aggregationelasticsearch-dsl

How to Count Unique Assets with Specific Severity in Elasticsearch?


I have data as follows:

{
        "_index": "index",
        "_id": "...",
        "_score": 4.512486,
        "_source": {
          "division": {
            "name": "Ben",
            "id": "678"
          },
          "summary": {
            "severity": 2,
            "last_issue_at": "...",
            "first_issue_at": "...",
            "issues_count": 1
          },
          "company": {
            "id": "..."
          },
          "ingest_timestamp": "..",
          "asset": {
            "name": "Bus",
            "vin": "123",
            "id": "...",
            "manufacturer": ...
          },
          "timestamp": "..."
        } 
    {
        "_index": "index",
        "_id": "...",
        "_score": 4.512486,
        "_source": {
          "division": {
            "name": "Ben",
            "id": "678"
          },
          "summary": {
            "severity": 1,
            "last_issue_at": "...",
            "first_issue_at": "...",
            "issues_count": 1
          },
          "company": {
            "id": "..."
          },
          "ingest_timestamp": "..",
          "asset": {
            "name": "Bus",
            "vin": "123",
            "id": "...",
            "manufacturer": ...
          },
          "timestamp": "..."
        }
      

Goal: Count unique assets with a severity of 1 or 2. If an asset has both severities, only count the one with severity 1.

Here is my query so far:

{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "company.id": "..."
          }
        },
        {
          "range": {
            "timestamp": {
              "gte": "...",
              "lte": "..."
            }
          }
        }
      ]
    }
  },
  "runtime_mappings": {
    "asset_id_and_vin": {
      "type": "keyword",
      "script": {
        "source": "if (doc.containsKey('asset.vin')) { emit(doc['asset.id'] + ' ' + doc['asset.vin']) } else { emit(doc['asset.id'].value + ' ' + 'N/A') }"
      }
    }
  },
  "aggs": {
    "asset_count": {
      "cardinality": {
        "field": "asset_id_and_vin"
      }
    },
    "assets": {
      "multi_terms": {
        "terms": [
          {
            "field": "asset.name"
          },
          {
            "field": "asset.vin",
            "missing": "N/A"
          },
          {
            "field": "division.name"
          },
          {
            "field": "asset.id"
          },
          {
            "field": "division.id"
          }
        ],
        "order": {
          "earliest_date": "desc"
        },
        "size": 100
      },
      "aggs": {
        "severity": {
          "min": {
            "field": "summary.severity"
          }
        },
        "earliest_date": {
          "min": {
            "field": "summary.first_issue_at"
          }
        },
        "latest_date": {
          "max": {
            "field": "summary.last_issue_at"
          }
        },
        "issues": {
          "sum": {
            "field": "summary.issues_count"
          }
        },
        "filtered_assets": {
          "bucket_selector": {
            "buckets_path": {
              "min_severity": "severity"
            },
            "script": "params.min_severity == 1 || params.min_severity == 2"
          }
        },
        "pagination": {
          "bucket_sort": {
            "size": 100,
            "from": 0
          }
        }
      }
    }
  }
} 

I added the bucket_selector to return severities that are either 1 or 2, not both. However, this doesn't account for assets that can have both severities, as it checks one document at a time. How can I modify my query to ensure I only count the asset with severity 1 if it has both severities?

Any advice or suggestions for improving my query would be greatly appreciated. Thanks in advance!


Solution

  • Instead of bucket_selector, you need to use bucket_sort, it is a bit tricky since bucket sort cannot work with terms directly, so you need to add another metric aggregation underneath that would simply mirror the keys in terms (min_value in the example below).

    Unfortunately, your requirement of sorting by earliest date is not feasible in this setup. The earliest date can only be found in a pipeline aggregation, which cannot be used for composite aggs sorting. To sort by earliest date we would need to how you index the data.

    DELETE test
    
    PUT test
    {
      "mappings": {
        "properties": {
          "division": {
            "properties": {
              "id": {
                "type": "keyword"
              },
              "name": {
                "type": "keyword"
              }
            }
          },
          "asset": {
            "properties": {
              "id": {
                "type": "keyword"
              },
              "vin": {
                "type": "keyword"
              },
              "name": {
                "type": "keyword"
              }
            }
          },
          "company": {
            "properties": {
              "id": {
                "type": "keyword"
              }
            }
          }
        }
      }
    }
    
    
    POST test/_bulk?refresh
    { "index": { "_id": "1" } }
    {"division":{"name":"Ben","id":"678"},"summary":{"severity":1,"last_issue_at":"2016-01-01","first_issue_at":"2015-01-01","issues_count":1},"company":{"id":"..."},"ingest_timestamp":"..","asset":{"name":"Bus","vin":"123","id":"...","manufacturer":"..."},"timestamp":"..."}
    { "index": { "_id": "2" } }
    {"division":{"name":"Ben","id":"678"},"summary":{"severity":2,"last_issue_at":"2024-01-01","first_issue_at":"2020-01-01","issues_count":1},"company":{"id":"..."},"ingest_timestamp":"..","asset":{"name":"Bus","vin":"123","id":"...","manufacturer":"..."},"timestamp":"..."}
      
    
    POST test/_search
    {
      "size": 0,
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "company.id": "..."
              }
            }
          ]
        }
      },
      "runtime_mappings": {
        "asset_id_and_vin": {
          "type": "keyword",
          "script": {
            "source": "if (doc.containsKey('asset.vin')) { emit(doc['asset.id'] + ' ' + doc['asset.vin']) } else { emit(doc['asset.id'].value + ' ' + 'N/A') }"
          }
        }
      },
      "aggs": {
        "asset_count": {
          "cardinality": {
            "field": "asset_id_and_vin"
          }
        },
        "assets": {
          "multi_terms": {
            "terms": [
              {
                "field": "asset.name"
              },
              {
                "field": "asset.vin",
                "missing": "N/A"
              },
              {
                "field": "division.name"
              },
              {
                "field": "asset.id"
              },
              {
                "field": "division.id"
              }
            ],
            "size": 100
          },
          "aggs": {
            "severity": {
              "terms": {
                "field": "summary.severity"
              },
              "aggs": {
                "min_value": {
                  "min": {
                    "field": "summary.severity"
                  }
                },
                "earliest_date": {
                  "min": {
                    "field": "summary.first_issue_at"
                  }
                },
                "latest_date": {
                  "max": {
                    "field": "summary.last_issue_at"
                  }
                },
                "issues": {
                  "sum": {
                    "field": "summary.issues_count"
                  }
                },
                "min_severity": {
                  "bucket_sort": {
                    "sort": [{
                      "min_value": {
                        "order": "asc"
                      }
                    }],
                    "size": 1
                  }
                }
              }
            }
          }
        }
      }
    }