Search code examples
Tags: elasticsearch, elasticsearch-aggregation, elasticsearch-analyzers

ElasticSearch Terms Aggregation not working with custom Analyzer and Pattern Tokenizer


I am trying the Terms Aggregation for the first time and there seems to be an issue with the custom pattern tokenizer I am using.

Here is the Mapping:

{
  "mappings": {
    "properties": {
      "contentItemType": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "patternAnalyzer"
      },
      "theme": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "patternAnalyzer"
      }
    }
  },
  "settings": {
    "analysis": {
      "analyzer": {
        "patternAnalyzer": {
          "tokenizer": "patternTokenizer"
        }
      },
      "tokenizer": {
        "patternTokenizer": {
          "type": "pattern",
          "pattern": ";"
        }
      }
    }
  }
}

When I run a terms aggregation through the search API (http://my_server/index_name/_search), this is the result:

{
  "aggregations": {
    "group_by_contentItemType": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "Correspondence; Reports",
          "doc_count": 3
        },
        {
          "key": "Correspondence",
          "doc_count": 2
        },
        {
          "key": "Meeting Minutes; Administrative Records; Reports",
          "doc_count": 2
        },
        {
          "key": "Correspondence; Legal and Treaty Material; Reports",
          "doc_count": 1
        },
        {
          "key": "Correspondence; Memoranda",
          "doc_count": 1
        },
        {
          "key": "Memoranda",
          "doc_count": 1
        },
        {
          "key": "Reports",
          "doc_count": 1
        }
      ]
    },
    "group_by_theme": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "International Relations",
          "doc_count": 2
        },
        {
          "key": "Key Events; Dissent; Dissent; Resistance; Human Rights",
          "doc_count": 2
        },
        {
          "key": "Border Security and Migration; Key Events",
          "doc_count": 1
        },
        {
          "key": "Border Security and Migration; Second World War Aftermath",
          "doc_count": 1
        },
        {
          "key": "Domestic Politics",
          "doc_count": 1
        },
        {
          "key": "Domestic Politics; Border Security and Migration",
          "doc_count": 1
        },
        {
          "key": "Economics and Trade; International Relations",
          "doc_count": 1
        },
        {
          "key": "Embassy and Consulate Administration; Industry and Agriculture; International Relations",
          "doc_count": 1
        },
        {
          "key": "Populations and Social Policy; Second World War Aftermath; International Relations",
          "doc_count": 1
        }
      ]
    }
  }
}

As you can see, the aggregation returns each whole semicolon-separated string as a single bucket instead of one bucket per individual value. I have been stuck on this problem for quite a few days. I have looked at many examples but am still not able to solve this issue. Please help. Thanks in advance!

EDIT: Here is the full mapping after applying @CatalinM's answer:

{
    "local_cwee": {
        "mappings": {
            "dynamic": "false",
            "properties": {
                "author": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "commentaries": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "contentDateEndMonth": {
                    "type": "integer"
                },
                "contentDateEndSpecified": {
                    "type": "boolean"
                },
                "contentDateEndYear": {
                    "type": "integer"
                },
                "contentDateMonth": {
                    "type": "integer"
                },
                "contentDateMonthSpecified": {
                    "type": "boolean"
                },
                "contentDateStartMonth": {
                    "type": "integer"
                },
                "contentDateStartSpecified": {
                    "type": "boolean"
                },
                "contentDateStartYear": {
                    "type": "integer"
                },
                "contentDateYear": {
                    "type": "integer"
                },
                "contentDoi": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "contentItemType": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "contentItemTypeFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "contentTitle": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "copyrightNotices": {
                    "type": "nested",
                    "properties": {
                        "imageName": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256
                                }
                            }
                        },
                        "text": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256
                                }
                            }
                        }
                    }
                },
                "countries": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "country": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "coverDateEndMonth": {
                    "type": "integer"
                },
                "coverDateEndSpecified": {
                    "type": "boolean"
                },
                "coverDateEndYear": {
                    "type": "integer"
                },
                "coverDateMonth": {
                    "type": "integer"
                },
                "coverDateMonthSpecified": {
                    "type": "boolean"
                },
                "coverDateStartMonth": {
                    "type": "integer"
                },
                "coverDateStartSpecified": {
                    "type": "boolean"
                },
                "coverDateStartYear": {
                    "type": "integer"
                },
                "coverDateYear": {
                    "type": "integer"
                },
                "displayName": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "documentDoi": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "documentLevel": {
                    "type": "integer"
                },                
                "keyEvents": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "language": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "languageFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "languages": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "languagesFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "moduleNumber": {
                    "type": "integer"
                },
                "notes": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "pageTranscript": {
                    "type": "text",
                    "term_vector": "with_positions",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    },
                    "analyzer": "whiteSpaceAnalyzer"
                },
                "people": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "publicationDate": {
                    "type": "integer"
                },
                "publicationDateEndMonth": {
                    "type": "integer"
                },
                "publicationDateEndSpecified": {
                    "type": "boolean"
                },
                "publicationDateEndYear": {
                    "type": "integer"
                },
                "publicationDateMonth": {
                    "type": "integer"
                },
                "publicationDateMonthSpecified": {
                    "type": "boolean"
                },
                "publicationDateStartMonth": {
                    "type": "integer"
                },
                "publicationDateStartSpecified": {
                    "type": "boolean"
                },
                "publicationDateStartYear": {
                    "type": "integer"
                },
                "publicationDateYear": {
                    "type": "integer"
                },
                "publicationDoi": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "publicationId": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "publicationIdFacet": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "publicationTitle": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "publicationType": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "publicationTypeFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "publicationYear": {
                    "type": "integer"
                },
                "publisherName": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "publisherNameFacet": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subject": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectAreas": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectAreasFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectCountries": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectCountriesFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectKeyword": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subjectKeywordFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subthemeFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "subthemes": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "theme": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "themeFacets": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                },
                "themes": {
                    "type": "text",
                    "analyzer": "patternAnalyzer",
                    "fielddata": true
                }
            }
        }
    }
}

Solution

  • With your custom tokenizer, the tokens in the text field are "Correspondence", "Meeting Minutes", "Administrative Records", etc., so I don't think you need the keyword sub-field.

    To make aggregations work on the text field, you'll have to add "fielddata": true to the mapping. It is disabled by default because aggregating on large analyzed text fields is usually not wanted (loading fielddata can consume a lot of heap memory), but in your case the tokens are exactly the values you want to aggregate on.

    Here's the simplified configuration:

    {
      "mappings": {
        "properties": {
          "contentItemType": {
            "type": "text",
            "fielddata": true,
            "analyzer": "patternAnalyzer"
          }
        }
      },
      "settings": {
        "analysis": {
          "analyzer": {
            "patternAnalyzer": {
              "tokenizer": "patternTokenizer"
            }
          },
          "tokenizer": {
            "patternTokenizer": {
              "type": "pattern",
              "pattern": ";"
            }
          }
        }
      }
    }
    

    the query:

    {
      "aggregations" : {
          "test" : {
              "terms" : { "field" : "contentItemType" }
          }
      }
    }
    

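    And the result (note that some keys keep a leading space, because the ";" pattern does not consume the whitespace that follows each separator; use a pattern such as ";\\s*" or add a "trim" token filter to the analyzer to remove it):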

    "aggregations": {
        "test": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {
                    "key": " Administrative Records",
                    "doc_count": 1
                },
                {
                    "key": "Meeting Minutes",
                    "doc_count": 1
                },
                {
                    "key": " Reports",
                    "doc_count": 1
                }
            ]
        }
    }