Search code examples
elasticsearch elasticsearch-aggregation

Elasticsearch aggregations with filter match


I have a document with a collection of nested documents:

{
  "_source": {
    ...
    "groups": [
      {
        "group_id": 100,
        "parent_group_id": 1,
        "title": "Wheel",
        "parent_group_title": "Parts"
      },
      {
        "group_id": 200,
        "parent_group_id": 2,
        "title": "Seat",
        "parent_group_title": "Parts"
      }
    ]
    ...
  }
}

The mapping looks like this:

{
  ...,

  "groups": {
    "type": "nested",
    "properties": {
      "group_id": {
        "type": "long"
      },
      "title": {
        "type": "text",
        "analyzer": "my_custom_analyzer",
        "term_vector": "with_positions_offsets",
        "fields": {
          "keyword": {
            "type": "keyword"
          }
        }
      },
      "parent_group_id": {
        "type": "long"
      },
      "parent_group_title": {
        "type": "text",
        "analyzer": "my_custom_analyzer",
        "term_vector": "with_positions_offsets",
        "fields": {
          "keyword": {
            "type": "keyword"
          }
        }
      }
    }
  },

  ...
}

What I'm trying to do is the following aggregation:

{
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "groups",
            "query": {
              "match": {
                "groups.title": {
                  "query": "whe"
                }
              }
            }
          }
        }
      ]
    }
  },
  "size": 0,
  "aggs": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "nested": {
                "path": "groups",
                "query": {
                  "match": {
                    "groups.title": {
                      "query": "whe"
                    }
                  }
                }
              }
            }
          ]
        }
      },
      "aggs": {
        "groups": {
          "nested": {
            "path": "groups"
          },
          "aggs": {
            "titles": {
              "terms": {
                "field": "groups.title.keyword",
                "size": 5
              },
              "aggs": {
                "parents": {
                  "terms": {
                    "field": "groups.parent_group_title.keyword",
                    "size": 3
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

With this query I get results like the following:

  "aggregations" : {
    "filtered" : {
      "doc_count" : ...,
      "groups" : {
        "doc_count" : ...,
        "titles" : {
          "doc_count_error_upper_bound" : ...,
          "sum_other_doc_count" : ...,
          "buckets" : [
            {
              "key" : "Seat",
              "doc_count" : 10,
              "parents" : {
                "doc_count_error_upper_bound" : 0,
                "sum_other_doc_count" : 10,
                "buckets" : [
                  {
                    "key" : "Parts",
                    "doc_count" : 6
                  },
                  {
                    "key" : "Other",
                    "doc_count" : 4
                  }
                ]
              }
            },
            {
              "key" : "Wheel",
              "doc_count" : 3,
              "parents" : {
                "doc_count_error_upper_bound" : 0,
                "sum_other_doc_count" : 3,
                "buckets" : [
                  {
                    "key" : "Parts",
                    "doc_count" : 2
                  },
                  {
                    "key" : "Other",
                    "doc_count" : 1
                  }
                ]
              }
            }
          ]
        }
      }
    }
  }

But what I want is for only the bucket with key Wheel to appear in the result buckets (or any other buckets whose keys match the search string whe).

I hope the question is clear enough. What am I doing wrong? Any suggestions for changing the data structure or the query?

UPD:
Adding the my_custom_analyzer definition for reference:

{
  "my_custom_analyzer": {
    "type": "custom",
    "tokenizer": "ngram",
    "filter": [
      "lowercase",
      "asciifolding"
    ],
    "char_filter": [
      "html_strip"
    ],
    "min_gram": 2,
    "max_gram": 15,
    "token_chars": [
      "letter",
      "digit"
    ]
  }
}


Solution

  • You may want to filter just before the terms aggregation on groups.title. This means you need neither your top-level query nor the filtered-level filter.

    I don't have your my_custom_analyzer available so I used a basic match but you get the gist:

    GET groups/_search
    {
      "size": 0,
      "aggs": {
        "groups": {
          "nested": {
            "path": "groups"
          },
          "aggs": {
            "titles": {
              "filter": {
                "match": {
                  "groups.title": {
                    "query": "wheel"
                  }
                }
              },
              "aggs": {
                "group_title_terms": {
                  "terms": {
                    "field": "groups.title.keyword",
                    "size": 5
                  },
                  "aggs": {
                    "parents": {
                      "terms": {
                        "field": "groups.parent_group_title.keyword",
                        "size": 3
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    

    UPDATE:

    There's an issue with your analyzer -- let's use _analyze to determine how whe would get tokenized:

    GET groups/_analyze
    {
      "text": "whe",
      "analyzer": "my_custom_analyzer"
    }
    

    yielding

    {
      "tokens" : [
        {
          "token" : "w",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "wh",
          "start_offset" : 0,
          "end_offset" : 2,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "h",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "he",
          "start_offset" : 1,
          "end_offset" : 3,
          "type" : "word",
          "position" : 3
        },
        {
          "token" : "e",
          "start_offset" : 2,
          "end_offset" : 3,
          "type" : "word",
          "position" : 4
        }
      ]
    }
    

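    I suspect that Seat gets matched because of the token e, which also occurs in Seat. Note that every token is only 1-2 characters long: min_gram, max_gram and token_chars were set on the analyzer rather than on a tokenizer, so they appear to be ignored and the ngram tokenizer falls back to its defaults (min_gram 1, max_gram 2).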


    My suggestion is to use the edge_ngram tokenizer instead of ngram, defined as a custom tokenizer so that min_gram, max_gram and token_chars actually take effect:

    PUT groups
    {
      "settings": {
        "analysis": {
          "analyzer": {
            "my_custom_analyzer": {
              "type": "custom",
              "tokenizer": "my_tokenizer",
              "filter": [
                "lowercase",
                "asciifolding"
              ],
              "char_filter": [
                "html_strip"
              ]
            }
          },
          "tokenizer": {
            "my_tokenizer": {
              "type": "edge_ngram",
              "min_gram": 2,
              "max_gram": 10,
              "token_chars": [
                "letter",
                "digit"
              ]
            }
          }
        }
      },
      "mappings": {
        "properties": {
          "groups": {
            "type": "nested",
            "properties": {
              "group_id": {
                "type": "long"
              },
              "title": {
                "type": "text",
                "analyzer": "my_custom_analyzer",
                "term_vector": "with_positions_offsets",
                "fields": {
                  "keyword": {
                    "type": "keyword"
                  }
                }
              },
              "parent_group_id": {
                "type": "long"
              },
              "parent_group_title": {
                "type": "text",
                "analyzer": "my_custom_analyzer",
                "term_vector": "with_positions_offsets",
                "fields": {
                  "keyword": {
                    "type": "keyword"
                  }
                }
              }
            }
          }
        }
      }
    }
    

    Apply the settings and mapping, reindex your documents, and you're good to go!