Search code examples
elasticsearch, elasticsearch-aggregation, elasticsearch-dsl

elasticsearch nested field array aggregation script


I am very new to Elasticsearch, so please bear with me. I've searched online and tried several different approaches, but I can't find an answer.

I have index mappings structured like this:

"index_1": { ...
},
"index_2": { ...
},
"index_3": {
  "mappings": {
    "dynamic": "strict",
    "properties": {
      ...
      "keywords": {
        "type": "nested",
        "properties": {
          "id": {
            "type": "keyword",
            "index": false,
            "ignore_above": 256
          },
          "term": {
            "type": "text",
            "copy_to": ["keywordsSearchField"],
            "term_vector": "with_positions_offsets",
            "analyzer": "pasc_index_autocomplete_analyzer",
            "search_analyzer": "pasc_standard_analyzer"
          },
          "vocab": {
            "type": "keyword",
            "ignore_above": 256,
            "copy_to": ["keywordsSearchField"]
          },
          "vocabUri": {
            "type": "keyword",
            "ignore_above": 256,
            "copy_to": ["keywordsSearchField"]
          }
        }
      },
      "keywordsSearchField": {
        "type": "text",
        "analyzer": "pasc_standard_analyzer"
      },
      ...
}

All indexes have the same mappings. What I'm trying to do is calculate the size of the nested keywords array for each document in every index, and then group the documents into categories by that size, for example: 1-5 keywords: 500 docs, 6-10 keywords: 1000 docs, etc.

I initially looked at script_fields, before I discovered that they can't be used in aggregations. This is an example:

{
  "_source": "*",
  "query": {
    "bool": {
      "must": [
        {
          "match_all": {}
        }
      ]
    }
  },
  "script_fields": {
    "keywords_size": {
      "script": {
        "lang": "painless",
        "source": "params['_source']['keywords'].size() >= 1 && params['_source']['keywords'].size() <= 5"
      }
    },
    "keywords_size1": {
      "script": {
        "lang": "painless",
        "source": "params['_source']['keywords'].size() >= 6 && params['_source']['keywords'].size() <= 10"
      }
    },
    "keywords_size2": {
      "script": {
        "lang": "painless",
        "source": "params['_source']['keywords'].size() >= 11 && params['_source']['keywords'].size() <= 15"
      }
    },
    "size": {
      "script": {
        "lang": "painless",
        "source": "params['_source']['keywords'].size()"
      }
    }
  }
}

This works well enough: it adds the computed fields to every document. I also tried to use the same script in aggs, to create a bucket for every category I need, but I can't get it to work.


Solution

  • Okay, so I managed to solve this by using a script. I am posting the answer here in case it helps anyone; however, I would still like to know how this could be done with nested field aggregations, without a script. So, here goes:

    {
      "query": {
        "bool": {
          "must": [
            {
              "match_all": {}
            }
          ]
        }
      },
      "size": 0,
      "aggs": {
        "avg_keywords_per_study": {
          "terms": {
            "script": {
              "lang": "painless",
              "source": "def kw = params['_source']['keywords']; int n = kw == null ? 0 : (kw instanceof List ? kw.size() : 1); if (n == 0) { return '0'; } else if (n <= 5) { return '1 to 5'; } else if (n <= 10) { return '6 to 10'; } else if (n <= 15) { return '11 to 15'; } else if (n <= 20) { return '16 to 20'; } else if (n <= 30) { return '21 to 30'; } return '31 or more';"
            }
          }
        }
      }
    }
    

    Sample Response:

        "aggregations": {
          "avg_keywords_per_study": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              { "key": "1 to 5", "doc_count": 487 },
              { "key": "6 to 10", "doc_count": 254 },
              { "key": "11 to 15", "doc_count": 28 },
              { "key": "16 to 20", "doc_count": 18 },
              { "key": "31 or more", "doc_count": 8 }
            ]
          }
        }