Search code examples
elasticsearch, elasticsearch-dsl

word count or get document from doc_count


Is there a way to get the underlying documents after a composite aggregation? Assume that after aggregating I get doc_count = 5. I want to retrieve the 5 documents behind that doc_count, because I have to analyze them. Alternatively, is there a way to count occurrences of a value, as in my example below?

I want to count how many times each user (ant, bird, cat, elep) has bought product a. My data looks like this:

{"Date":"20200515","product":["a","a","a","b","c"],"user":"ant","rank":"silver"}
{"Date":"20200515","product":["a","b","c","e","f"],"user":"ant","rank":"silver"}
{"Date":"20200515","product":["a","a","c","c","d"],"user":"bird","rank":"silver"}
{"Date":"20200515","product":["a","a","c","d","e"],"user":"cat","rank":"silver"}
{"Date":"20200515","product":["a","a","a","b","f"],"user":"cat","rank":"silver"}
{"Date":"20200515","product":["a","a","b","c","d"],"user":"elep","rank":"silver"}

And my query looks like this:

{
  "aggs":{
      "comp":{
         "composite":{
            "sources":[
               {
                  "log_date":{
                     "terms":{
                        "field":"Date.keyword"
                     }
                  }
               },
               {
                  "product":{
                     "terms":{
                        "field":"product.keyword",
                        "missing_bucket":true
                     }
                  }
               },
               {
                  "rank":{
                     "terms":{
                        "field":"rank.keyword",
                        "missing_bucket":true
                     }
                  }
               },
               {
                  "user":{
                     "terms":{
                        "field":"user.keyword",
                        "missing_bucket":true
                     }
                  }
               }
            ]
         }
      }
  }
}

And this is my result:

Date      user rank    product doc_count
20200515  ant  silver    a        2
20200515  bird silver    a        1  
20200515  cat  silver    a        2
20200515  elep silver    a        1
...

And this is my expected result:

Date      user rank    product doc_count amount
20200515  ant  silver    a        2        4
20200515  bird silver    a        1        2
20200515  cat  silver    a        2        5
20200515  elep silver    a        1        2

Solution

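  • You need to convert your product field to the nested type, so that each product entry is indexed as its own nested document and repeated values (e.g. several "a" entries in one order) can be counted individually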

    Mapping:

    {
      "mappings": {
        "properties": {
          "product":{
            "type": "nested",
            "properties": {
              "name":{
                "type":"text",
                "fields":{
                  "keyword":{
                    "type":"keyword"
                  }
                }
              }
            }
          }
        }
      }
    }
    

    Data:

    {
      "Date": "20200515",
      "product": [
        {
          "name": "a"
        },
        {
          "name": "a"
        },
        {
          "name": "a"
        },
        {
          "name": "a"
        },
        {
          "name": "b"
        },
        {
          "name": "c"
        }
      ],
      "user": "ant",
      "rank": "silver"
    }
    

    Query:

    {
      "query": {
        "bool": {
          "filter": {
            "nested": {  --> use nested type to filter on product
              "path": "product",
              "query": {
                "match": {
                  "product.name": "a"
                }
              }
            }
          }
        }
      },
      "aggs": {
        "user_count": { --> total count of users
          "cardinality": {
            "field": "user.keyword"
          }
        },
        "users": {
          "terms": {
            "field": "user.keyword",
            "size": 10
          },
          "aggs": {
            "product": {
              "nested": {
                "path": "product"
              },
              "aggs": {
                "product_name": {
                  "terms": {
                    "field": "product.name.keyword",
                    "include":"a", --> include only specific value, accepts array
                    "size": 10
                  },
                  "aggs": {
                    "amount": {
                      "value_count": {
                        "field": "product.name.keyword"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    

    Result:

    "hits" : {
        "total" : {
          "value" : 1,
          "relation" : "eq"
        },
        "max_score" : 0.0,
        "hits" : [
          {
            "_index" : "index44",
            "_type" : "_doc",
            "_id" : "WtSYJXIBEIlbGJUZf3Ve",
            "_score" : 0.0,
            "_source" : {
              "Date" : "20200515",
              "product" : [
                {
                  "name" : "a"
                },
                {
                  "name" : "a"
                },
                {
                  "name" : "a"
                },
                {
                  "name" : "a"
                },
                {
                  "name" : "b"
                },
                {
                  "name" : "c"
                }
              ],
              "user" : "ant",
              "rank" : "silver"
            }
          }
        ]
      },
      "aggregations" : {
        "user_count" : {
          "value" : 1
        },
        "users" : {
          "doc_count_error_upper_bound" : 0,
          "sum_other_doc_count" : 0,
          "buckets" : [
            {
              "key" : "ant",
              "doc_count" : 1,
              "product" : {
                "doc_count" : 6,
                "product_name" : {
                  "doc_count_error_upper_bound" : 0,
                  "sum_other_doc_count" : 0,
                  "buckets" : [
                    {
                      "key" : "a",
                      "doc_count" : 4,
                      "amount" : {
                        "value" : 4
                      }
                    }
                  ]
                }
              }
            }
          ]
        }
      }