Search code examples
elasticsearch, autocomplete, kibana, n-gram, elasticsearch-aggregation

How to get Elasticsearch terms aggregation for multi valued fields using NGram filter for autocompletion?


I am working on an autocompletion project and am new to Elasticsearch. I have used the Edge NGram filter for autocompletion. I am trying to get unique results for autocompletion, so I have used a terms aggregation on all the fields. I get good results for fields that have a single value, but not for fields that have more than one value: if the query matches at least one value in such a field, the aggregation returns all the values from that field (whether or not the query matches the other values).

My settings and mapping under the garments index are :

PUT /garments
{
  "settings" : 
  {
    "number_of_replicas": 3,
    "number_of_shards": 2,
    "analysis": 
    {
      "analyzer": 
      {
        "autocomplete": 
        {
          "tokenizer": "autocomplete",
          "filter": 
          [
            "lowercase"
          ]
        },
        "autocomplete_search": 
        {
          "tokenizer": "lowercase"
        }
      },
      "tokenizer": 
      {
        "autocomplete": 
        {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": 
          [
            "letter"
          ]
        }
      }

    }
  },
  "mappings":
  {
    "properties":
    {
      "color": 
      {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search",
        "fields": 
        {
          "keyword": 
          { 
            "type": "keyword"
          }
        }

      }
........
........
........
   }
}

(Note that I am using the text type.) Suppose I have a color field in a document with multiple values, such as ["blue","black","orange","marble","jet black"], and my search query is:

GET /garments/_search
{
  "size": 0, 
  "query": 
  {
      "query_string": {
        "query": "bl"
      }
  },
  "aggs":
  {
    "Term_aggregation": 
    {
      "terms": 
      {
        "field": "color.keyword", 
        "size": 100
      }
    }
  }
}

This returns all the values, i.e. "blue", "black", "orange", "marble", "jet black". But I wanted only blue, black, and jet black as my results (the query is "bl"). Later I used

"include": " .*bl.*" 

filter in my terms aggregation, which gave me blue, black, marble, and jet black as my results. This include filter is also case-sensitive. Please help!


Solution

  • If you want to do a case-insensitive match on a keyword field, you can use a normalizer with a lowercase filter.

    The normalizer property of keyword fields is similar to analyzer except that it guarantees that the analysis chain produces a single token.

    {
      "settings": {
        "analysis": {
          "normalizer": {
            "lowercase_normalizer": {
              "type": "custom",
              "filter": [
                "lowercase"
              ]
            }
          }
        }
      },
      "mappings": {
        "properties": {
          "color": {
            "type": "text",
            "analyzer": "autocomplete",
            "search_analyzer": "autocomplete_search",
            "fields": {
              "keyword": {
                "type": "keyword",
                "normalizer": "lowercase_normalizer"
              }
            }
          }
        }
      }
    }
    

    With this normalizer, "include": ".*bl.*" will work even if the actual value has uppercase letters, because the keyword values are lowercased at index time.

    EDIT 1

    As per your comment, if you don't want to use include in the terms aggregation, you need to index your color field with the nested type, so that each color is treated as a separate object.

    Mappings:

    PUT index64
    {
      "settings": {
        "number_of_replicas": 3,
        "number_of_shards": 2,
        "analysis": {
          "analyzer": {
            "autocomplete": {
              "tokenizer": "autocomplete",
              "filter": [
                "lowercase"
              ]
            },
            "autocomplete_search": {
              "tokenizer": "lowercase"
            }
          },
          "tokenizer": {
            "autocomplete": {
              "type": "edge_ngram",
              "min_gram": 2,
              "max_gram": 10,
              "token_chars": [
                "letter"
              ]
            }
          }
        }
      },
      "mappings": {
        "properties": {
          "color": {
            "type": "nested",
            "properties": {
              "name": {
                "type": "text",
                "analyzer": "autocomplete",
                "search_analyzer": "autocomplete_search",
                "fields": {
                  "keyword": {
                    "type": "keyword"
                  }
                }
              }
            }
          }
        }
      }
    }
    
    

    Index a sample document:

    POST index64/_doc
    {
      "color": [
        {
          "name": "blue"
        },
        {
          "name": "black"
        },
        {
          "name": "orange"
        },
        {
          "name": "marble"
        },
        {
          "name": "jet black"
        }
      ]
    }
    
    

    Search query:

    GET index64/_search
    {
      "size": 0, 
      "aggs": {
        "color": {
          "nested": {
            "path": "color"
          },
          "aggs": {
            "select_color": {
              "filter": {
                "match":{
                  "color.name":"bl"
                }
              },
              "aggs": {
                "distinct_colors": {
                  "terms": {
                    "field": "color.name.keyword",
                    "size": 10
                  }
                }
              }
            }
          }
        }
      }
    }
    

    Result

    "aggregations" : {
        "color" : {
          "doc_count" : 5,
          "select_color" : {
            "doc_count" : 3,
            "distinct_colors" : {
              "doc_count_error_upper_bound" : 0,
              "sum_other_doc_count" : 0,
              "buckets" : [
                {
                  "key" : "black",
                  "doc_count" : 1
                },
                {
                  "key" : "blue",
                  "doc_count" : 1
                },
                {
                  "key" : "jet black",
                  "doc_count" : 1
                }
              ]
            }
          }
        }
      }