Search code examples
elasticsearchfuzzy-searchelasticsearch-query

Elasticsearch Similar Text Query


Given the following documents in an index (let's call it addresses):

{
    ADDRESS: {
        ID: 1,
        LINE1: "steet 1",
        CITY: "kuala lumpur",
        COUNTRY: "MALAYSIA",
        ...
    } 
}
{
    ADDRESS: {
        ID: 2,
        LINE1: "steet 1",
        CITY: "kualalumpur city",
        COUNTRY: "MALAYSIA",
        ...
    }
}
{
    ADDRESS: {
        ID: 3,
        LINE1: "steet 1",
        CITY: "kualalumpur",        
        COUNTRY: "MALAYSIA",
        ...
    }
}
{
    ADDRESS: {
        ID: 4,
        LINE1: "steet 1",
        CITY: "kuala lumpur city",      
        COUNTRY: "MALAYSIA",
        ...
    }
}

So far, I have a query that returns "kualalumpur", "kuala lumpur", and "kualalumpur city" for the search text "kualalumpur".
However, "kuala lumpur city" is missing from the results, even though it is very similar to "kualalumpur city".

Here is my query so far:

{
  "query": {
    "bool": {
      "minimum_should_match": 2,
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "ADDRESS.COUNTRY.keyword": "MALAYSIA"
              }
            }
          ]
        }
      },
      "should": [
        {
          "match": {
            "ADDRESS.STREET": {
              "query": "street 1",
              "fuzziness": 1,
              "operator": "AND"
            }
          }
        },
        {
          "bool": {
            "should": [
              {
                "match": {
                  "ADDRESS.CITY": {
                    "query": "kualalumpur",
                    "fuzziness": 1,
                    "operator": "OR"
                  }
                }
              },
              {
                "match": {
                  "ADDRESS.CITY.keyword": {
                    "query": "kualalumpur",
                    "fuzziness": 1,
                    "operator": "OR"
                  }
                }
              }
            ]
          }
        }
      ]
    }
  }
}

Given the condition, is it possible at all for Elasticsearch to return all four documents with search text "kualalumpur"?


Solution

  • You can use an edge n-gram tokenizer on the city field (named `country` in the example below) to get all four docs. I tried it locally; a working example follows.

    Create a custom analyzer and apply it to your field:

    {
        "settings": {
            "index": {
                "analysis": {
                    "analyzer": {
                        "ngram_analyzer": {
                            "type": "custom",
                            "tokenizer": "edgeNGramTokenizer",
                            "filter": [
                                "lowercase"
                            ]
                        }
                    },
                    "tokenizer": {
                        "edgeNGramTokenizer": {
                            "type": "edge_ngram",
                            "min_gram": 1,
                            "max_gram": 40,
                            "token_chars": [
                                "letter",
                                "digit"
                            ]
                        }
                    }
                },
                "max_ngram_diff": 50
            }
        },
        "mappings": {
            "properties": {
                "country": {
                    "type": "text",
                    "analyzer": "ngram_analyzer"
                }
            }
        }
    }
    

    Index all four of your sample docs, like below:

    {
      "country" : "kuala lumpur"
    }
    

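    A search query with the term kualalumpur matches all four docs, because the same edge n-gram analyzer is also applied to the query text, so its prefixes (k, ku, kua, …) match the prefixes indexed for "kuala":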

    {
      "query": {
        "match": {
          "country": {
            "query": "kualalumpur"
          }
        }
      }
    }
    
     "hits": [
          {
            "_index": "fuzzy",
            "_type": "_doc",
            "_id": "3",
            "_score": 5.0003963,
            "_source": {
              "country": "kualalumpur"
            }
          },
          {
            "_index": "fuzzy",
            "_type": "_doc",
            "_id": "2",
            "_score": 4.4082437,
            "_source": {
              "country": "kualalumpur city"
            }
          },
          {
            "_index": "fuzzy",
            "_type": "_doc",
            "_id": "1",
            "_score": 0.5621849,
            "_source": {
              "country": "kuala lumpur"
            }
          },
          {
            "_index": "fuzzy",
            "_type": "_doc",
            "_id": "4",
            "_score": 0.4956103,
            "_source": {
              "country": "kuala lumpur city"
            }
          }
        ]