Search code examples
elasticsearch, elastic-stack

Elasticsearch does not match a partial query


I'm trying to create an analyzer that matches part of a query. The main use case is the term "3D mammogram": for some reason, searching against the field indexed with my autocomplete analyzer (shown below) produces no results. After removing the "operator" : "AND" option, Elasticsearch started returning results, but the results I expect still get a lower score for some reason.

Here are the settings and the mappings for my index:

MAPPINGS:

{
    "index": {
        "properties": {
            "code": {
                "type": "text"
            },
            "type": {
                "type": "text"
            },
            "term": {
                "type": "text",
                "analyzer": "autocomplete",
                "search_analyzer": "index_search"
            }
        }
    }
}

SETTINGS:

{
  "index" : {
    "settings" : {
      "index" : {
        "number_of_shards" : "5",
        "provided_name" : "index",
        "creation_date" : ".......",
        "analysis" : {
          "filter" : {
            "case_transition_filter" : {
              "split_on_numerics" : "true",
              "type" : "word_delimiter",
              "preserve_original" : "true",
              "stem_english_possessive" : "false"
            },
            "autocomplete_filter" : {
              "type" : "edge_ngram",
              "min_gram" : "2",
              "max_gram" : "15"
            },
            "hyphen-filter" : {
              "pattern" : "-",
              "type" : "pattern_replace",
              "replacement" : " "
            }
          },
          "analyzer" : {
            "autocomplete" : {
              "filter" : [ "case_transition_filter", "lowercase", "hyphen-filter", "autocomplete_filter" ],
              "type" : "custom",
              "tokenizer" : "keyword"
            },
            "index_search" : {
              "type" : "standard"
            }
          }
        },
        "number_of_replicas" : "1",
        "uuid" : ".....g",
        "version" : {
          "created" : "..."
        }
      }
    }
  }
}

As you can see I'm using two different analyzers - the autocomplete one for indexing and a standard one for search.

From my backend I'm hitting the elastic index with these two match queries wrapped in a bool query:

{
  "bool" : {
    "should" : [
      {
        "match" : {
          "term" : {
            "query" : "3d mammogram",
            "operator" : "AND",
            "analyzer" : "keyword",
            "fuzziness" : "1",
            "prefix_length" : 1,
            "max_expansions" : 50,
            "fuzzy_transpositions" : true,
            "lenient" : false,
            "zero_terms_query" : "NONE",
            "auto_generate_synonyms_phrase_query" : true,
            "boost" : 2.0
          }
        }
      },
      {
        "match" : {
          "term" : {
            "query" : "3d mammogram",
            "operator" : "AND",
            "fuzziness" : "1",
            "prefix_length" : 1,
            "max_expansions" : 50,
            "fuzzy_transpositions" : true,
            "lenient" : false,
            "zero_terms_query" : "NONE",
            "auto_generate_synonyms_phrase_query" : true,
            "boost" : 1.0
          }
        }
      }
    ],
    "adjust_pure_negative" : true,
    "minimum_should_match" : "1",
    "boost" : 1.0
  }
}

As written, both queries produce no results. After removing "operator" : "AND" from the second query, I start getting reasonable results, but not the ones I expect. Here are the results from the second query:

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 93,
    "max_score" : 20.951433,
    "hits" : [
      {
        "_index" : "index",
        "_type" : "index",
        "_id" : ".....",
        "_score" : 20.951433,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Routine mammogram"
        }
      },
      {
        "_index" : "...",
        "_type" : "...",
        "_id" : "...",
        "_score" : 19.059473,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Mammogram"
        }
      },
      {
        "_index" : "....",
        "_type" : "...",
        "_id" : "...",
        "_score" : 18.515629,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Screening mammogram"
        }
      },
      {
        "_index" : "...",
        "_type" : "search-term",
        "_id" : "....",
        "_score" : 18.515629,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "treatment procedures",
          "term" : "Diagnostic mammogram"
        }
      },
      {
        "_index" : "....",
        "_type" : "...",
        "_id" : "...",
        "_score" : 18.515629,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Digital mammogram"
        }
      },
      {
        "_index" : "...",
        "_type" : "...",
        "_id" : "...",
        "_score" : 18.480751,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Screening 3D mammogram"
        }
      },
      {
        "_index" : "...",
        "_type" : "...",
        "_id" : "...",
        "_score" : 18.376223,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "t...",
          "term" : "Diagnostic 3D mammogram"
        }
      },
      {
        "_index" : "...",
        "_type" : "...",
        "_id" : "...",
        "_score" : 17.930023,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Mammography"
        }
      },
      {
        "_index" : "...",
        "_type" : "...",
        "_id" : "....",
        "_score" : 17.287262,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Screening mammography"
        }
      },
      {
        "_index" : "....",
        "_type" : "...",
        "_id" : "...",
        "_score" : 17.287262,
        "_source" : {
          "id" : null,
          "careNeedCode" : "...",
          "careNeedType" : "...",
          "term" : "Abnormal mammography"
        }
      }
    ]
  }
}

As you can see, the results containing "3d mammogram" are ranked well below results that contain only "mammogram". I'm not sure what I am missing here.


Solution

  • Based on your index mapping and settings, the tokens generated for "Screening 3D mammogram" will be:

    {
      "tokens": [
        {
          "token": "sc",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "scr",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "scre",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "scree",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screen",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screeni",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screenin",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening ",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening 3",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening 3d",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening 3d ",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening 3d m",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening 3d ma",
          "start_offset": 0,
          "end_offset": 22,
          "type": "word",
          "position": 0
        },
        {
          "token": "sc",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "scr",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "scre",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "scree",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "screen",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "screeni",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "screenin",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "screening",
          "start_offset": 0,
          "end_offset": 9,
          "type": "word",
          "position": 0
        },
        {
          "token": "ma",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mam",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mamm",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mammo",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mammog",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mammogr",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mammogra",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        },
        {
          "token": "mammogram",
          "start_offset": 13,
          "end_offset": 22,
          "type": "word",
          "position": 3
        }
      ]
    }
    

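    No token is generated for "3d". This is because the autocomplete analyzer uses "tokenizer" : "keyword", which emits the whole string as a single token; the word_delimiter filter then splits "3D" into "3" and "D", and those one-character tokens are dropped by the edge_ngram filter because min_gram is 2. You need to modify your index settings and change the tokenizer from keyword to standard.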

    The modified analyzer definition will be:

    "analyzer" : {
                "autocomplete" : {
                  "filter" : [ "case_transition_filter", "lowercase", "hyphen-filter", "autocomplete_filter" ],
                  "type" : "custom",
                  "tokenizer" : "standard"       // note this
                },
    

    You need to reindex the data with this new index mapping.


    Adding a working example with index data, index mapping, search query, and search result:

    Index Mapping:

    {
      "settings": {
        "analysis": {
          "filter": {
            "case_transition_filter": {
              "split_on_numerics": "true",
              "type": "word_delimiter",
              "preserve_original": "true",
              "stem_english_possessive": "false"
            },
            "autocomplete_filter": {
              "type": "edge_ngram",
              "min_gram": "2",
              "max_gram": "15"
            },
            "hyphen-filter": {
              "pattern": "-",
              "type": "pattern_replace",
              "replacement": " "
            }
          },
          "analyzer": {
            "autocomplete": {
              "filter": [
                "case_transition_filter",
                "lowercase",
                "hyphen-filter",
                "autocomplete_filter"
              ],
              "type": "custom",
              "tokenizer": "standard"           // note this
            },
            "search_term_search": {
              "type": "standard" 
            }
          }
        },
        "max_ngram_diff": 20
      },
      "mappings": {
        "properties": {
          "term": {
            "type": "text",
            "analyzer": "autocomplete",
            "search_analyzer": "search_term_search"
          }
        }
      }
    }
    

    The generated tokens will include both "3d" and "mammogram".

    Index Data:

    {
      "term": "Screening mammogram"
    }
    {
      "term": "Diagnostic 3D mammogram"
    }
    {
      "term": "Mammography"
    }
    

    Search Query:

    {
      "query": {
        "match": {
          "term": {
            "query": "3D mammogram",
            "operator": "and"
          }
        }
      }
    }
    

    Search Result:

    "hits": [
          {
            "_index": "67607194",
            "_type": "_doc",
            "_id": "4",
            "_score": 1.4572026,
            "_source": {
              "term": "Diagnostic 3D mammogram"
            }
          }
        ]