Search code examples
elasticsearch, tokenize, analysis, analyzer

Elasticsearch custom analyzer with ngram and without word delimiter on hyphens


I am trying to index strings that contain hyphens but do not contain spaces, periods or any other punctuation. I do not want to split up the words based on hyphens, instead I would like to have the hyphens be part of the indexed text.

For example, my 6 text strings would be:

  • magazineplayon
  • magazineofhorses
  • online-magazine
  • best-magazine
  • friend-of-magazines
  • magazineplaygames

I would like to be able to search these strings for text containing "play" or for text starting with "magazine".

I have been able to use ngram to make the text containing "play" work properly. However, the hyphen is causing text to split and it is including results where "magazine" is in the word after a hyphen. I only want words starting at the beginning of the string with "magazine" to appear.

Based on the sample above, only these 3 should appear when beginning with "magazine":

  • magazineplayon
  • magazineofhorses
  • magazineplaygames

Please help with my ElasticSearch Index Sample:

DELETE /sample

PUT /sample
{
    "settings": {
        "index.number_of_shards":5,
        "index.number_of_replicas": 0,
        "analysis": {
            "filter": {
                "nGram_filter": {
                   "type": "nGram",
                   "min_gram": 2,
                   "max_gram": 20,
                   "token_chars": [
                      "letter",
                      "digit"
                   ]
                },
                "word_delimiter_filter": {
                    "type": "word_delimiter",
                    "preserve_original": true,
                    "catenate_all" : true
                }
             },
          "analyzer": {
            "ngram_index_analyzer": {
              "type" : "custom",
              "tokenizer": "lowercase",
              "filter" : ["nGram_filter", "word_delimiter_filter"]
            }
          }
        }
    }
}
PUT /sample/1/_create
{
    "name" : "magazineplayon"
}
PUT /sample/3/_create
{
    "name" : "magazineofhorses"
}
PUT /sample/4/_create
{
    "name" : "online-magazine"
}
PUT /sample/5/_create
{
    "name" : "best-magazine"
}
PUT /sample/6/_create
{
    "name" : "friend-of-magazines"
}
PUT /sample/7/_create
{
    "name" : "magazineplaygames"
}

GET /sample/_search
{
"query": {
        "wildcard": {
          "name": "*play*" 
        }
    }
}

GET /sample/_search
{
"query": {
        "wildcard": {
          "name": "magazine*" 
        }
    }
}

Update 1: I updated all my create statements to include the type "test" after the index name "sample":

PUT /sample/test/7/_create
{
    "name" : "magazinefairplay"
}

I then ran the following command to return only names that had the word "play" in them instead of doing the wildcard search. This worked correctly and returned only two records.

POST /sample/test/_search
{
    "query": {
        "bool": {
            "minimum_should_match": 1,
            "should": [
                {"match": { "name.substrings": "play" }}
            ]
        }
    }
}

I ran the following command to return only names that started with "magazine". My expectation was that "online-magazine", "best-magazine" and "friend-of-magazines" would not appear. However, all seven records were returned including these three.

POST /sample/test/_search
{
    "query": {
        "bool": {
            "minimum_should_match": 1,
            "should": [
                {"match": { "name.prefixes": "magazine" }}
            ]
        }
    }
}

Is there a way to filter out the prefix where the hyphen is used?


Solution

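  • You're on the right path; however, you also need to add another analyzer that leverages the edge-ngram token filter in order to make the "starts with" constraint work. You can keep the ngram for checking fields that "contain" a given word, but you need edge-ngram to check that a field "starts with" some token. Note that both analyzers below use the keyword tokenizer (followed by a lowercase token filter) instead of the lowercase tokenizer: the keyword tokenizer emits the whole string, hyphens included, as a single token, so the edge n-grams are anchored only at the very beginning of the string and not at the start of each hyphen-separated word.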

    PUT /sample
    {
      "settings": {
        "index.number_of_shards": 5,
        "index.number_of_replicas": 0,
        "analysis": {
          "filter": {
            "nGram_filter": {
              "type": "nGram",
              "min_gram": 2,
              "max_gram": 20,
              "token_chars": [
                "letter",
                "digit"
              ]
            },
            "edgenGram_filter": {
              "type": "edgeNGram",
              "min_gram": 2,
              "max_gram": 20
            }
          },
          "analyzer": {
            "ngram_index_analyzer": {
              "type": "custom",
              "tokenizer": "keyword",
              "filter": [
                "lowercase",
                "nGram_filter"
              ]
            },
            "edge_ngram_index_analyzer": {
              "type": "custom",
              "tokenizer": "keyword",
              "filter": [
                "lowercase",
                "edgenGram_filter"
              ]
            }
          }
        }
      },
      "mappings": {
        "test": {
          "properties": {
            "name": {
              "type": "string",
              "fields": {
                "prefixes": {
                  "type": "string",
                  "analyzer": "edge_ngram_index_analyzer",
                  "search_analyzer": "standard"
                },
                "substrings": {
                  "type": "string",
                  "analyzer": "ngram_index_analyzer",
                  "search_analyzer": "standard"
                }
              }
            }
          }
        }
      }
    }
    

    Then your query will become the following (i.e. search for all documents whose name field contains "play" or starts with "magazine"):

    POST /sample/test/_search
    {
        "query": {
            "bool": {
                "minimum_should_match": 1,
                "should": [
                    {"match": { "name.substrings": "play" }},
                    {"match": { "name.prefixes": "magazine" }}
                ]
            }
        }
    }
    

    Note: don't use wildcard for searching for substrings, as it will kill the performance of your cluster (more info here and here)