Search code examples
elasticsearch, nested, analyzer, synonym

Some multi-word synonyms are not working in Elasticsearch for nested fields


I am trying to use a synonym analyzer at query time, but I am not getting the expected results. Can someone shed some light on this?

Here is my mapping for the index:

{
  "jobs_user_profile_v2": {
    "mappings": {
      "profile": {
        "_all": {
          "enabled": false
        },
        "_ttl": {
          "enabled": true
        },
        "properties": {

          "rsa": {
            "type": "nested",
            "properties": {
              "answer": {
                "type": "string",
                "index_analyzer": "autocomplete",
                "search_analyzer": "synonym",
                "position_offset_gap": 100
              },
              "answerId": {
                "type": "long"
              },
              "answerOriginal": {
                "type": "string",
                "index": "not_analyzed"
              },
              "createdAt": {
                "type": "long"
              },
              "label": {
                "type": "string",
                "index": "not_analyzed"
              },
              "labelOriginal": {
                "type": "string",
                "index": "not_analyzed"
              },
              "question": {
                "type": "string",
                "index": "not_analyzed"
              },
              "questionId": {
                "type": "long"
              },
              "questionOriginal": {
                "type": "string"
              },
              "source": {
                "type": "integer"
              },
              "updatedAt": {
                "type": "long"
              }
            }
          }

        }
      }
    }
  }
}

The field to focus on is rsa.answer, which is the field I am querying.

My synonym mapping:

Beautician,Stylist,Make up artist,Massage therapist,Therapist,Spa,Hair Dresser,Salon,Beauty Parlour,Parlor => Beautician
Carpenter,Wood Worker,Furniture Carpenter => Carpenter
Cashier,Store Manager,Store Incharge,Purchase Executive,Billing Executive,Billing Boy => Cashier
Content Writer,Writer,Translator,Writing,Copywriter,Content Creation,Script Writer,Freelance Writer,Freelance Content Writer => Content Writer

My Search Query:

http://{{domain}}/jobs_user_profile_v2/_search

{
  "query": {
      "nested":{
           "path": "rsa",
           "query":{
    "query_string": {
      "query": "hair dresser",
      "fields": ["answer"],
      "analyzer" :"synonym"



    }
    },
     "inner_hits": {
          "explain": true
      }

  }
  },
  "explain" : true,
  "sort" : [ {
    "_score" : { }
  } ]
}

It shows the proper `Beautician` and `Cashier` profiles for the search queries `Hair Dresser` and `billing executive`, but it shows nothing for the `wood worker => carpenter` case.

My analyzer results:

http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=hair dresser


{
  "tokens": [
    {
      "token": "beautician",
      "start_offset": 0,
      "end_offset": 12,
      "type": "SYNONYM",
      "position": 1
    }
  ]
}

And for the `wood worker` case:

http://{{domain}}/jobs_user_profile_v2/_analyze?analyzer=synonym&text=wood worker


{
  "tokens": [
    {
      "token": "carpenter",
      "start_offset": 0,
      "end_offset": 11,
      "type": "SYNONYM",
      "position": 1
    }
  ]
}

It is also not working in a few other cases.

My analyzer setting for index:

 "analysis": {
          "filter": {
            "synonym": {
              "ignore_case": "true",
              "type": "synonym",
              "synonyms_path": "synonym.txt"
            },
            "autocomplete_filter": {
              "type": "edge_ngram",
              "min_gram": "3",
              "max_gram": "10"
            }
          },
          "analyzer": {
            "text_en_splitting_search": {
              "type": "custom",
              "filter": [
                "stop",
                "lowercase",
                "porter_stem",
                "word_delimiter"
              ],
              "tokenizer": "whitespace"
            },
            "synonym": {
              "filter": [
                "stop",
                "lowercase",
                "synonym"
              ],
              "type": "custom",
              "tokenizer": "standard"
            },
            "autocomplete": {
              "filter": [
                "lowercase",
                "autocomplete_filter"
              ],
              "type": "custom",
              "tokenizer": "standard"
            },
            "text_en_splitting": {
              "filter": [
                "lowercase",
                "porter_stem",
                "word_delimiter"
              ],
              "type": "custom",
              "tokenizer": "whitespace"
            },
            "text_general": {
              "filter": [
                "lowercase"
              ],
              "type": "custom",
              "tokenizer": "standard"
            },
            "edge_ngram_analyzer": {
              "filter": [
                "lowercase"
              ],
              "type": "custom",
              "tokenizer": "edge_ngram_tokenizer"
            },
            "autocomplete_analyzer": {
              "filter": [
                "lowercase"
              ],
              "tokenizer": "whitespace"
            }
          },
          "tokenizer": {
            "edge_ngram_tokenizer": {
              "token_chars": [
                "letter",
                "digit"
              ],
              "min_gram": "2",
              "type": "edgeNGram",
              "max_gram": "10"
            }
          }
        }

Solution

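  • For the above case, a multi-match query is a better fit than a query-string query. Unlike query-string, multi-match does not tokenize the query terms before analyzing them. Because query-string splits the query into separate terms first, the synonym filter never sees the whole phrase, so multi-word synonyms may not work as expected with query-string.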

    Example:

    {
       "query": {
          "nested": {
             "path": "rsa",
             "query": {
                "multi_match": {
                   "query": "wood worker",
                   "fields": [
                      "rsa.answer"
                   ],
                   "type" : "cross_fields",
                   "analyzer": "synonym"
                }
             }
          }
       }
    }
    

    If for some reason you prefer query-string, you need to wrap the entire query in double quotes to ensure it is not tokenized before analysis:

    example :

    post test/_search
    {
       "query": {
          "nested": {
             "path": "rsa",
             "query": {
                "query_string": {
                   "query": "\"wood worker\"",
                   "fields": [
                      "rsa.answer"
                   ],
                   "analyzer": "synonym"
                }
             }
          }
       }
    }