Search code examples
loopselasticsearchelasticsearch-painless

Loop though ElasticSearch documents source array in painless


I have the following ElasticSearch data structure for products in a webshop:

{
  "_index": "vue_storefront_catalog_1_product_1617378559",
  "_type": "_doc",
  "_source": {
    "configurable_children": [
      {
        "price": 49.99,
        "special_price": 34.99,
        "special_from_date": "2020-11-27 00:00:00",
        "special_to_date": "2020-11-30 23:59:59",
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      }
      {
        "price": 49.99,
        "special_price": null,
        "special_from_date": null,
        "special_to_date": null,
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      }
    ]
}

Using the following mapping:

{
  "vue_storefront_catalog_1_product_1614928276" : {
    "mappings" : {
      "properties" : {
        "configurable_children" : {
          "properties" : {
            "price" : {
              "type" : "double"
            },
            "special_from_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "special_price" : {
              "type" : "double"
            },
            "special_to_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
          }
        }
      }
    }
  }
}

I have created a Elasticsearch query to filter out only products that are in sale, that means: the special_price must be lower than the price and the current date must be between the special_from_date and special_to_date.

This is the Painless script I have created:

  boolean hasSale = false;

  long timestampNow = new Date().getTime();
  if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
    long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
    if (timestampSpecialFromDate > timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
    long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
    if (timestampSpecialToDate < timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.stock.is_in_stock') && doc['configurable_children.stock.is_in_stock'].value == false) {
      hasSale = false;
  } else if (1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction) {
    hasSale = true;
  }

  return hasSale

This returns the product as soon as one of the configurable_children has met the criteria to be a sale product. This is incorrect, because I need to loop through the whole set op configurable_children to determine if it's a sale product. How can I make sure all children are taken into the calculation? With a loop?


Here is the new query as suggested by Joe in the answers:

GET vue_storefront_catalog_1_product/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "source": """
                int allEntriesAreTrue(def arrayList) {
                  return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                } 
                
                ArrayList childrenAreMatching = [];
                
                long timestampNow = params.timestampNow;
                
                ArrayList children = params._source['configurable_children'];
                
                if (children == null || children.size() == 0) {
                  return allEntriesAreTrue(childrenAreMatching);
                }
                
                for (config in children) {
                  if (!config.containsKey('stock')) {
                    childrenAreMatching.add(false);
                    continue;
                  } else if (!config['stock']['is_in_stock']
                      || config['special_price'] == null
                      || config['special_from_date'] == null 
                      || config['special_to_date'] == null) {
                    childrenAreMatching.add(false);
                    continue;
                  } 
                  
                  if (config['special_from_date'] != null && config['special_to_date'] != null) {
                    SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    def from_millis = sf.parse(config['special_from_date']).getTime();
                    def to_millis = sf.parse(config['special_to_date']).getTime();
                    
                    if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                      childrenAreMatching.add(false);
                      continue;
                    }
                  }
                  
                  def sale_fraction = 1 - (config['special_price'] / config['price']);
                  if (sale_fraction <= params.fraction) {
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  childrenAreMatching.add(true);
                }
                return allEntriesAreTrue(childrenAreMatching);
              """,
              "params": {
                "timestampNow": 1617393889567,
                "fraction": 0.1
              }
            }
          }
        }
      ],
      "min_score": 1
    }
  }
}

The response is as follows:

{
  "took" : 15155,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2936,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [... hits here ...]
  }
}

Any idea why the query takes around 15 seconds?


Solution

  • Your intuition is right — you'll need to use a for loop if you want to check all of the array list objects.

    Now, before I jump onto the iteration aspect, there's one important thing to know about arrays in Elasticsearch. When they're not defined as nested, their content will be flattened and the relationships between the individual key/value pairs will be lost. As such, you should definitely adjust your mapping like so:

    {
      "vue_storefront_catalog_1_product_1614928276" : {
        "mappings" : {
          "properties" : {
            "configurable_children" : {
              "type": "nested",        <---
              "properties" : {
                "price" : {
                  "type" : "double"
                },
                ...
              }
            }
          }
        }
      }
    }
    

    and reindex your data to ensure that the configurable_children are treated as separate, standalone entities.

    As soon as they're mapped as nested, you'll be able to retrieve just those children that do match your scripted condition:

    POST vue_storefront_catalog_1_product_1614928276/_search
    {
      "_source": "configurable_children_that_match", 
      "query": {
        "nested": {
          "path": "configurable_children",
          "inner_hits": {
            "name": "configurable_children_that_match"
          }, 
          "query": {
            "bool": {
              "must": [
                {
                  "script": {
                    "script": {
                      "source": """
                        boolean hasSale = false;
                        
                        long timestampNow = new Date().getTime();
                        
                        if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
                          long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
                          if (timestampSpecialFromDate > timestampNow) {
                           return false
                          }
                        } 
                        
                        if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
                          long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
                          if (timestampSpecialToDate < timestampNow) {
                            return false
                          }
                        }
                        
                        if (doc.containsKey('configurable_children.stock.is_in_stock') && doc['configurable_children.stock.is_in_stock'].value == false) {
                            return false
                        }
                        
                        if (1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction) {
                          hasSale = true;
                        }
                        
                        return hasSale
                      """,
                      "params": {
                        "fraction": 0.1
                      }
                    }
                  }
                }
              ]
            }
          }
        }
      }
    }
    

    Two things to note here:

    1. The inner_hits attribute of a nested query allows you to let Elasticsearch know that you're only interested in those children that truly matched. Otherwise, all configurable_children would be returned. When specified in the _source parameter, the original, full JSON document source would be skipped and only the named inner_hits would be returned.
    2. Due to the distributed nature of ES, it's not recommended to use java's new Date(). I've explained the reasoning behind it my answer to How to get current time as unix timestamp for script use. You'll see me use a parametrized now in the query at the bottom of this answer.

    Moving on, it's important to mention that nested objects are internally represented internally as separate subdocuments.

    A side effect of this fact is that once you're inside a nested query's context, you don't have access to other nested children of the very same document.

    In order to mitigate this, it's customary to regularly keep the nested children in sync such that when you do flatten one of the objects' attributes for use on the top-level, you can use a simply iterate the respective doc values. This flattening is usually done through the copy_to feature which I illustrated in my answer to How to iterate through a nested array in elasticsearch with filter script?

    In your particular use case, this'd mean that you'd, for instance, use copy_to on the field stock.is_in_stock which'd result in a top-level boolean array list which is easier to work with than an array list of objects.

    So far so good but you'd still be missing a way to filter based on the special_dates.

    Now, regardless of whether you're dealing with nested or regular object field types, accessing params._source in regular script queries doesn't work in ES since v6.4.

    However, there's still one type of query that does support iterating the _source — enter function_score queries.

    As stated in your question, you

    ..need to loop through the whole set of configurable_children to determine if it's a sale product..

    With that being said, here's how my query below works:

    1. The function_score query typically generates a custom calculated score, but it can, with the help of min_score, be used as a boolean yes/no filter to exclude docs whose configurable_children do not fulfil a certain condition.
    2. As the configurable_children are being iterated, each loop appends a boolean to childrenAreMatching which is then passed onto the allEntriesAreTrue helper which returns a 1 if they are, and a 0 if not.
    3. The dates are parsed and compared with the parametrized now; the fraction is compared too. If, at any point, some condition fails, the loop jumps to the next iteration.
    POST vue_storefront_catalog_1_product_1614928276/_search
    {
      "query": {
        "function_score": {
          "query": {
            "match_all": {}
          },
          "functions": [
            {
              "script_score": {
                "script": {
                  "source": """
                    // casting helper
                    int allEntriesAreTrue(def arrayList) {
                      return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                    } 
                    
                    ArrayList childrenAreMatching = [];
                    
                    long timestampNow = params.timestampNow;
                    
                    ArrayList children = params._source['configurable_children'];
                    
                    if (children == null || children.size() == 0) {
                      return allEntriesAreTrue(childrenAreMatching);
                    }
                    
                    for (config in children) {
                      if (!config['stock']['is_in_stock']
                          || config['special_price'] == null
                          || config['special_from_date'] == null 
                          || config['special_to_date'] == null) {
                        // nothing to do here...
                        childrenAreMatching.add(false);
                        continue;
                      } 
                      
                      SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                      def from_millis = sf.parse(config['special_from_date']).getTime();
                      def to_millis = sf.parse(config['special_to_date']).getTime();
                      
                      if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                        // not in date range
                        childrenAreMatching.add(false);
                        continue;
                      }
                      
                      def sale_fraction = 1 - (config['special_price'] / config['price']);
                      if (sale_fraction <= params.fraction) {
                        // fraction condition not met
                        childrenAreMatching.add(false);
                        continue;
                      }
                      
                      childrenAreMatching.add(true);
                    }
                    
                    // need to return a number because it's a script score query
                    return allEntriesAreTrue(childrenAreMatching);
                  """,
                  "params": {
                    "timestampNow": 1617393889567,
                    "fraction": 0.1
                  }
                }
              }
            }
          ],
          "min_score": 1
        }
      }
    }
    

    All in all, only those documents, whose all configurable_children fulfil the specified conditions, would be returned.


    P.S. If you learned something from this answer and want to learn more, I dedicated a whole chapter to ES scripts in my Elasticsearch Handbook.