Search code examples
elasticsearch elasticsearch-aggregation opensearch elasticsearch-painless elasticsearch-nested

Concatenating fields in an OpenSearch / Elasticsearch aggregation


I have an OpenSearch index with the following mapping (simplified):

PUT /house
{
  "mappings": {
    "properties": {
      "house": { "type": "keyword" },
      "people": {
        "type": "nested",
        "properties": {
          "forename": { "type": "keyword" },
          "surname": { "type": "keyword" }
        }
      }
    }
  }
}

I'd like to retrieve an aggregate where the bucket key is "[forename] [surname]".

Toy data:

PUT /house/_doc/1
{
  "house": "house1",
  "people": [
    { "forename": "Dave", "surname": "Daveson" },
    { "forename": "Jeff", "surname": "Jeffson" }
  ]
}

PUT /house/_doc/2
{
  "house": "house1",
  "people": [
    { "forename": "Dave", "surname": "Daveson" },
    { "forename": "Jeffs", "surname": "Jeffsons" }
  ]
}

The following doesn't return what I'd expect, and I can't figure out what object paths to put in the script to get it to work:

GET house/_search
{
  "aggs": {
    "people": {
      "nested": {
        "path": "people"
      },
      "aggs": {
        "people.name": {
          "terms": {
            "script": "[params._source['forename'], params._source['surname']].join(' ')"
          }
        }
      }
    }
  },
  "size": 0
}

Returns:

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "people" : {
      "doc_count" : 4,
      "people.name" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "null null",
            "doc_count" : 4
          }
        ]
      }
    }
  }
}

Without a script I can aggregate correctly on forename, on surname, or on both as separate aggregations. With separate aggregations, however, I can't reliably "join" the two result lists, since buckets can only be sorted by doc_count or by key:

GET house/_search
{
  "aggs": {
    "people": {
      "nested": {
        "path": "people"
      },
      "aggs": {
        "people.forename": {
          "terms": { "field": "people.forename" }
        },
        "people.surname": {
          "terms": { "field": "people.surname" }
        }
      }
    }
  },
  "size": 0
}

Returns:

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "people" : {
      "doc_count" : 4,
      "people.surname" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Daveson",
            "doc_count" : 2
          },
          {
            "key" : "Jeffson",
            "doc_count" : 1
          },
          {
            "key" : "Jeffsons",
            "doc_count" : 1
          }
        ]
      },
      "people.forename" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Dave",
            "doc_count" : 2
          },
          {
            "key" : "Jeff",
            "doc_count" : 1
          },
          {
            "key" : "Jeffs",
            "doc_count" : 1
          }
        ]
      }
    }
  }
}

Solution

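  • Inside the nested aggregation, read the fields through doc values using their full nested path (doc['people.forename'], doc['people.surname']) instead of params._source, and concatenate them in the script. You want this query: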

    GET house/_search
    {
      "aggs": {
        "people": {
          "nested": {
            "path": "people"
          },
          "aggs": {
            "people.name": {
              "terms": {
                "script": "doc['people.forename'].value + ' ' +  doc['people.surname'].value"
              }
            }
          }
        }
      },
      "size": 0
    }
    

    Results:

    "aggregations" : {
        "people" : {
          "doc_count" : 4,
          "people.name" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "Dave Daveson",
                "doc_count" : 2
              },
              {
                "key" : "Jeff Jeffson",
                "doc_count" : 1
              },
              {
                "key" : "Jeffs Jeffsons",
                "doc_count" : 1
              }
            ]
          }
        }
      }