Search code examples
elasticsearchelasticsearch-pluginelasticsearch-5

Sum and count aggregations over Elasticsearch fields


I am new to Elasticsearch and I am looking to perform certain aggregations over the fields from an Elasticsearch 5.x index. I have an index that contains the documents with fields langs (which have nested structure) and docLang. These are dynamically mapped fields. Following are the examples documents

DOC 1:

{
   "_index":"A",
   "_type":"document",
   "_id":"1",
   "_source":{
      "text":"This is a test sentence.",
      "langs":{
         "X":{
            "en":1,
            "es":2,
            "zh":3
         },
        "Y":{
            "en":4,
            "es":5,
            "zh":6
         } 
      },
      "docLang": "en"
   }
}

DOC 2:

{
   "_index":"A",
   "_type":"document",
   "_id":"2",
   "_source":{
      "text":"This is a test sentence.",
      "langs":{
         "X":{
            "en":1,
            "es":2
         },
         "Y":{
            "en":3,
            "es":4
         } 
      },
      "docLang": "es"
   }
}

DOC 3:

{
   "_index":"A",
   "_type":"document",
   "_id":"2",
   "_source":{
      "text":"This is a test sentence.",
      "langs":{
         "X":{
            "en":1
         },
         "Y":{
            "en":2
         } 
      },
      "docLang": "en"
   }
}

I want to perform sum aggregation over the langs field in a way that for each key (X/Y) and for each language, I can get the sum across all documents in an index. Also, I want to produce the counts of documents for each type of language from docLang field.

e.g.: For above 3 documents, sum aggregation over langs field would look like below:

"langs":{  
      "X":{  
         "en":3,
         "es":4,
         "zh":3
      },
      "Y":{  
         "en":9,
         "es":9,
         "zh":6
      }
   }

And the docLang count would look like below:

 "docLang":{
    "en" : 2,
    "es" : 1
   }

Also because of some production env restrictions, I cannot use scripts in Elasticsearch. So, I was wondering if it is possible to use just field aggregation type for above fields?


Solution

  • {
      "size": 0,
      "aggs": {
        "X": {
          "nested": {
            "path": "langs.X"
          },
          "aggs": {
            "X_sum_en": {
              "sum": {
                "field": "langs.X.en"
              }
            },
            "X_sum_es": {
              "sum": {
                "field": "langs.X.es"
              }
            },
            "X_sum_zh": {
              "sum": {
                "field": "langs.X.zh"
              }
            }
          }
        },
        "Y": {
          "nested": {
            "path": "langs.Y"
          },
          "aggs": {
            "Y_sum_en": {
              "sum": {
                "field": "langs.Y.en"
              }
            },
            "Y_sum_es": {
              "sum": {
                "field": "langs.Y.es"
              }
            },
            "Y_sum_zh": {
              "sum": {
                "field": "langs.Y.zh"
              }
            }
          }
        },
        "sum_docLang": {
          "terms": {
            "field": "docLang.keyword",
            "size": 10
          }
        }
      }
    }
    

    Since you didn't mention, but I think it's important. I made X and Y as nested fields:

        "langs": {
          "properties": {
            "X": {
              "type": "nested",
              "properties": {
                "en": {
                  "type": "long"
                },
                "es": {
                  "type": "long"
                },
                "zh": {
                  "type": "long"
                }
              }
            },
            "Y": {
              "type": "nested",
              "properties": {
                "en": {
                  "type": "long"
                },
                "es": {
                  "type": "long"
                },
                "zh": {
                  "type": "long"
                }
              }
            }
          }
        }
    

    But, if you fields are not nested at all and here I mean actually the nested field type in Elasticsearch, a simple aggregation like this one should be enough:

    {
      "size": 0,
      "aggs": {
        "X_sum_en": {
          "sum": {
            "field": "langs.X.en"
          }
        },
        "X_sum_es": {
          "sum": {
            "field": "langs.X.es"
          }
        },
        "X_sum_zh": {
          "sum": {
            "field": "langs.X.zh"
          }
        },
        "Y_sum_en": {
          "sum": {
            "field": "langs.Y.en"
          }
        },
        "Y_sum_es": {
          "sum": {
            "field": "langs.Y.es"
          }
        },
        "Y_sum_zh": {
          "sum": {
            "field": "langs.Y.zh"
          }
        },
        "sum_docLang": {
          "terms": {
            "field": "docLang.keyword",
            "size": 10
          }
        }
      }
    }