elasticsearch

Tokenizer that splits uppercase into two words


How would I configure a tokenizer that splits:

LoremIpsum

into "lorem", "ipsum" and "loremipsum"?

My settings look like this:

{ "settings": {
"index": {
  "analysis": {
    "analyzer": {
      "split_words": {
        "tokenizer": "keyword",
        "type": "custom",
        "filter": [
          "asciifolding",
          "split_words",
          "lowercase"
        ]
      },
    },
    "filter": {
      "split_words": {
        "type": "word_delimiter",
        "preserve_original": true,
        "split_on_case_change": true,
        "generate_word_parts": true,
        "stem_english_possessive": true,
        "type_table": [
          "- => ALPHA",
          "+ => ALPHA"
        ]
      }
    }
  }
}}}

The index gets created without errors, but when I use my analyzer during search (multi_match), I don't get the expected results.


Solution

  • TL;DR

    It does not look like your analyzer is wrong; we should probably take a look at both your mapping and your search query.

    Testing the analyzer

    PUT 79457368/
    {
      "settings": {
        "index": {
          "analysis": {
            "analyzer": {
              "split_words": {
                "tokenizer": "keyword",
                "type": "custom",
                "filter": [
                  "asciifolding",
                  "split_words",
                  "lowercase"
                ]
              }
            },
            "filter": {
              "split_words": {
                "type": "word_delimiter",
                "preserve_original": true,
                "split_on_case_change": true,
                "generate_word_parts": true,
                "stem_english_possessive": true,
                "type_table": [
                  "- => ALPHA",
                  "+ => ALPHA"
                ]
              }
            }
          }
        }
      }
    }
    
    POST 79457368/_analyze
    {
      "analyzer": "split_words",
      "text": "LorelIpsum"
    }
    

    Will return the following:

    {
      "tokens": [
        {
          "token": "lorelipsum",
          "start_offset": 0,
          "end_offset": 10,
          "type": "word",
          "position": 0
        },
        {
          "token": "lorel",
          "start_offset": 0,
          "end_offset": 5,
          "type": "word",
          "position": 0
        },
        {
          "token": "ipsum",
          "start_offset": 5,
          "end_offset": 10,
          "type": "word",
          "position": 1
        }
      ]
    }
    

    Which is what you are looking for.
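
    As an optional check, the `_analyze` API also accepts an `explain` flag that breaks the output down per tokenizer and token filter, so you can see which filter in the chain emits each token (shown here against the analyzer above):

    POST 79457368/_analyze
    {
      "analyzer": "split_words",
      "text": "LoremIpsum",
      "explain": true
    }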

    Mapping

    In order to use this analyzer on your dataset, you need to do the following:

    PUT 79457368/
    {
      "settings": {
        "index": {
          "analysis": {
            "analyzer": {
              "split_words": {
                "tokenizer": "keyword",
                "type": "custom",
                "filter": [
                  "asciifolding",
                  "split_words",
                  "lowercase"
                ]
              }
            },
            "filter": {
              "split_words": {
                "type": "word_delimiter",
                "preserve_original": true,
                "split_on_case_change": true,
                "generate_word_parts": true,
                "stem_english_possessive": true,
                "type_table": [
                  "- => ALPHA",
                  "+ => ALPHA"
                ]
              }
            }
          }
        }
      },
      "mappings":{
        "properties": {
          "data":{
            "type": "text",
            "analyzer": "split_words" ### This is what needs to be add to your text fields
          }
        }
      }
    }
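
    As a quick end-to-end check (a minimal sketch: the document below, with a `data` field containing "LoremIpsum", is just an illustration), you can index one document and then search for one of the generated word parts. By default, `multi_match` analyzes the query text with the analyzer defined on the field:

    POST 79457368/_doc?refresh
    {
      "data": "LoremIpsum"
    }

    GET 79457368/_search
    {
      "query": {
        "multi_match": {
          "query": "ipsum",
          "fields": ["data"]
        }
      }
    }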
    

    Using a specific analyzer in the search query

    In some cases it is possible to specify, in the search query itself, the analyzer used to process the query text.

    It defaults to the analyzer defined in the mapping unless you specify otherwise.

    GET 79457368/_search
    {
      "query": {
        "multi_match": {
          "query": "LorelIpsum",
          "fields": ["data"],
          "analyzer": "custom_analyzer"  ### Will use the analyser `custom_analyzer` instead of `split_words` defined on the mapping
        }
      }
    }
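
    If you are unsure which analyzer a field actually resolves to, you can also point `_analyze` at the field itself instead of a named analyzer; it will pick up the analyzer from the mapping:

    GET 79457368/_analyze
    {
      "field": "data",
      "text": "LoremIpsum"
    }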