Search code examples
elasticsearch

Elasticsearch doc null pointer exception error for scripted metric on date_histogram


In elastisearch I have an key that is an array I need to count the unique items in the array. Without overlapping items of other documents. So I made this:

The scripted_metric works when it isn't in the date_histogram

'aggs' => [
    'groupByWeek' => [
        'date_histogram' => [
            'field' => 'date',
            'calendar_interval' => '1w',
        ],
        'aggs' => [
            'count_unique_locations' => [
                'scripted_metric' => [
                     'init_script' => 'state.locations = []',
                     'map_script' => 'state.locations.addAll(doc.unique_locations_with_error)',
                     'combine_script' => 'return state.locations',
                     'reduce_script' => "
                         def locations = [];
                         for (state in states) {
                             for(location in state) {
                                 if(!locations.contains(location) && location != '' ) {
                                     locations.add(location); 
                                 }
                             }
                        }
                        return locations.length;
                    ",
                ],
            ],
        ],
    ],
],

When I run the query I get this error:

{
  "error": {
    "root_cause":[],
    "type":"search_phase_execution_exception",
    "reason":"",
    "phase":"fetch",
    "grouped":true,
    "failed_shards":[],
    "caused_by":{
      "type":"script_exception",
      "reason":"runtime error",
      "script_stack":[
        "locations = []; ","            ^---- HERE"],
        "script":"def locations = []; for (state in states) { for(location in state){if(!locations.contains(location) && location != '' ) {locations.add(location); }}} return locations.length;",
      "lang":"painless",
      "position":{
        "offset":16,
        "start":4,
        "end":20
      },
      "caused_by":{
       "type":"null_pointer_exception",
       "reason":"cannot access method/field [iterator] from a null def reference"
      }
    }
  },
  "status":400
}

I think it has something to do with that doc now null is but I don't know why of how to fix it.


Solution

  • If you want to count unique array items in documents from a date range, you can use an unscripted aggregation

    Mapping

    PUT /unique_array_item
    {
        "mappings": {
            "properties": {
                "text": {
                    "type": "keyword"
                },
                "date": {
                    "type": "date"
                }
            }   
        }
    }
    

    Documents

    PUT /unique_array_item/_bulk
    {"create":{"_id":1}}
    {"date":"2024-03-28","text":["banana","banana"]}
    {"create":{"_id":2}}
    {"date":"2024-03-28","text":["apple","apple","apple","apple","apple","apple","banana"]}
    {"create":{"_id":3}}
    {"date":"2024-03-14","text":["cherry","banana","apple"]}
    {"create":{"_id":4}}
    {"date":"2024-03-14","text":["pineapple"]}
    

    Query with aggregations

    GET /unique_array_item/_search?filter_path=aggregations
    {
        "aggs": {
            "by_week": {
                "date_histogram": {
                    "field": "date",
                    "calendar_interval": "1w",
                    "min_doc_count": 1
                },
                "aggs": {
                    "date_interval_unique_array_item_count": {
                        "cardinality": {
                            "field": "text"
                        }
                    }
                }
            }
        }
    }
    

    Response

    {
        "aggregations" : {
            "by_week" : {
                "buckets" : [
                    {
                        "key_as_string" : "2024-03-11T00:00:00.000Z",
                        "key" : 1710115200000,
                        "doc_count" : 2,
                        "date_interval_unique_array_item_count" : {
                            "value" : 4
                        }
                    },
                    {
                        "key_as_string" : "2024-03-25T00:00:00.000Z",
                        "key" : 1711324800000,
                        "doc_count" : 2,
                        "date_interval_unique_array_item_count" : {
                            "value" : 2
                        }
                    }
                ]
            }
        }
    }
    

    Test it:

    • "2024-03-11T00:00:00.000Z": ["cherry","banana","apple","pineapple"] = 4
    • "2024-03-25T00:00:00.000Z": ["apple","banana"] = 2

    Response is correct