Search code examples
elasticsearch, nlp, elasticsearch-analyzers

Using different language analyzers with ngram Analyzer in one mapping in Elasticsearch


I want to use English and German custom analyzers together with other analyzers, for example ngram. Is the following mapping correct? I am getting an error for the German analyzer: `unknown setting [index.filter.german_stop.type]`. I searched but did not find any information about using multiple language analyzers of the custom type in one mapping. Is it possible to use a language-specific ngram filter?

PUT test  {
    "settings": {
        "analysis": {
            "analyzer": {
                "english_analyzer": {
                    "type": "custom",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "ngram_filter_en"
                    ],
                    "tokenizer": "whitespace"
                }
            },
            "filter": {
                "english_stop": {
                    "type": "stop"
                },
                "ngram_filter_en": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 25
                }
              },
                  "german_analyzer" : {
                    "type" : "custom",
                    "filter" : [
                         "lowercase",
                         "german_stop",
                        "ngram_filter_de"
                          ],
                    "tokenizer" : "whitespace"
              }
            },
            "filter" : {
                "german_stop" : {
                    "type" : "stop"
              },
                "ngram_filter_de" : {
                    "type" : "edge_ngram",
                    "min_ngram" : "1",
                    "max_gram" : 25
              }
        }
    },
    "mappings" : {
      "dynamic" : true,
      "properties": {
        "content" : {
          "tye" : "text",
          "properties" : {
            "en" : {
              "type" : "text",
              "analyzer" : "english_analyzer"
            },
            "de" : {
              "type" : "text",
              "analyzer" : "german_analyzer"
            }
        }
      }
    } 

Solution

  • There are small syntax errors.

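    1. Your last "filter" object is outside the "analysis" context (it sits directly under "settings", which is why Elasticsearch reports the unknown setting index.filter.german_stop.type).
    2. You cannot use the same key (here "filter") more than once in a single JSON object.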

    So the settings below, with both filter blocks merged into a single "filter" object, should help:

    {
      "analysis": {
        "analyzer": {
          "english_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "english_stop",
              "ngram_filter_en"
            ]
          },
          "german_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "german_stop",
              "ngram_filter_de"
            ]
          }
        },
        "filter": {
          "english_stop": {
            "type": "stop",
            "stopwords": "_english_"
          },
          "ngram_filter_en": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 25
          },
          "german_stop": {
            "type": "stop",
            "stopwords": "_german_"
          },
          "ngram_filter_de": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 25
          }
        }
      }
    }
    

    To understand the error in your mapping, look at where the second "filter" object is placed:

    {
            "analysis": {
                "analyzer": {
                "filter": {
                    "english_stop": {
                        "type": "stop"
                    },
                    "ngram_filter_en": {
                        "type": "edge_ngram",
                        "min_gram": 1,
                        "max_gram": 25
                    }
                  },
                      "german_analyzer" : {
                        "type" : "custom",
                        "filter" : [
                             "lowercase",
                             "german_stop",
                            "ngram_filter_de"
                              ],
                        "tokenizer" : "whitespace"
                  }
                }, 
                "filter" : {//**This is outside analysis, you cannot simply add another filter key inside analysis, so you can merge both as above**
                    "german_stop" : {
                        "type" : "stop"
                  },
                    "ngram_filter_de" : {
                        "type" : "edge_ngram",
                        "min_ngram" : "1",
                        "max_gram" : 25
                  }
            }