Tags: amazon-web-services, elasticsearch, elastic-stack, elasticsearch-analyzers

Elasticsearch phonetic analyzer returns zero results?


I am getting 0 results when using the ES phonetic analyzer.

I am using the plugin that comes built into AWS Elasticsearch Service: https://aws.amazon.com/about-aws/whats-new/2016/12/amazon-elasticsearch-service-now-supports-phonetic-analysis/
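As a sanity check (assuming the domain exposes the cat APIs), the installed plugins can be listed; in stock Elasticsearch the phonetic plugin appears as analysis-phonetic, though the managed service may label it differently:

GET _cat/plugins?v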

Before indexing, I used this code to set up the phonetic analyzer.

PUT endpoint/courts_2
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "my_analyzer": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "my_metaphone"
            ]
          }
        },
        "filter": {
          "my_metaphone": {
            "type": "phonetic",
            "encoder": "metaphone",
            "replace": true
          }
        }
      }
    }
  }
}
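To sanity-check the filter chain without running a search, the _analyze API also accepts an ad-hoc tokenizer/filter combination against the index:

GET courts_2/_analyze
{
  "tokenizer": "standard",
  "filter": ["lowercase", "my_metaphone"],
  "text": "Abhijith"
}

With "replace": true this returns only the phonetic code; with "replace": false the filter would keep the original token alongside it.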

Note: I did not install the plugin manually, since AWS provides it pre-built (see the link above). Now, I am using this query against the endpoint:

{
    "query": {
        "multi_match": {
            "query": "Abhijith",
            "fields": ["content", "title^10"],
            "analyzer": "my_analyzer"
        }
    },
    "size": 1,
    "_source": ["title", "bench", "court"],
    "highlight": {
        "fields": {
            "title": {},
            "content": {}
        }
    }
}

But I am getting zero results back; this is the response:

{
    "took": 1,
    "timed_out": false,
    "_shards": {
        "total": 5,
        "successful": 5,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 0,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    }
}
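One way to debug this is the validate API with explain, which prints the Lucene query the request is rewritten to:

GET courts_2/_validate/query?explain=true
{
  "query": {
    "multi_match": {
      "query": "Abhijith",
      "fields": ["content", "title^10"],
      "analyzer": "my_analyzer"
    }
  }
}

This shows that the query side searches for the phonetic code ABHJ rather than the literal text.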

I can confirm that when I run the same query without the analyzer parameter, I do get hits back.

The analyzer itself returns the expected output when I test it directly with _analyze, though:

GET courts_2/_analyze
{
  "analyzer": "my_analyzer",
  "text": "Abhijith"
}

Response

{
    "tokens": [
        {
            "token": "ABHJ",
            "start_offset": 0,
            "end_offset": 8,
            "type": "<ALPHANUM>",
            "position": 0
        }
    ]
}

Index Mapping

{
    "courts_2": {
        "mappings": {
            "properties": {
                "author": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "bench": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "citation": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "content": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "court": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "date": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "id_": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "title": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "verdict": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                }
            }
        }
    }
}
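To check which tokens were actually indexed for a given document, the term vectors API can be used (the document ID below is a placeholder):

GET courts_2/_termvectors/1?fields=title

If the terms come back as plain lowercased words rather than phonetic codes, the field was not indexed with the phonetic analyzer.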

Solution

  • It seems you didn't specify a mapping for your courts_2 index, so all text fields are indexed with the standard analyzer.

    The phonetic tokens are therefore never indexed, which means they cannot be matched at query time.
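    You can see the mismatch directly: at index time the standard analyzer keeps the literal word, while your query analyzer produces the phonetic code. Compare:

    GET courts_2/_analyze
    {
      "analyzer": "standard",
      "text": "Abhijith"
    }

    This returns the single token abhijith, which never equals the phonetic token ABHJ from your earlier test, hence the zero hits.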

    To configure your text fields to use your analyzer, you need a mapping like this:

    PUT endpoint/courts_2
    {
      "settings": {
        "index": {
          "analysis": {
            "analyzer": {
              "my_analyzer": {
                "tokenizer": "standard",
                "filter": [
                  "lowercase",
                  "my_metaphone"
                ]
              }
            },
            "filter": {
              "my_metaphone": {
                "type": "phonetic",
                "encoder": "metaphone",
                "replace": true
              }
            }
          }
        }
      },
      "mappings": {
        "properties": {
          "content": {
            "type": "text",
            "analyzer": "my_analyzer"
          },
          "title": {
            "type": "text",
            "analyzer": "my_analyzer"
          }
        }
      }
    }
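
    Note that the analyzer of an existing field cannot be changed in place, and courts_2 already exists, so the PUT above would actually need to target a fresh index. You would then copy your documents over with the reindex API (the target index name here is just an example):

    POST _reindex
    {
      "source": { "index": "courts_2" },
      "dest": { "index": "courts_2_phonetic" }
    }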
    

    Here is the documentation about mapping parameters: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-params.html

    Regards.