Search code examples
Tags: elasticsearch, hashmap, pipeline, ingest

Elasticsearch ingest pipeline: how to recursively modify values in a HashMap


Using an ingest pipeline, I want to iterate over a HashMap and remove underscores from all string values (where underscores exist), leaving underscores in the keys intact. Some values are arrays that must further be iterated over to do the same modification.

In the pipeline, I use a function to traverse and modify the values of a Collection view of the HashMap.

PUT /_ingest/pipeline/samples
{
    "description": "preprocessing of samples.json",
    "processors": [
        {
            "script": {
                "tag": "remove underscore from sample_tags values",
                "source": """
                    // Visits every element of `collection`: String elements get
                    // replace('_',' ') called on them, any other element is passed
                    // back into findReplace.
                    void findReplace(Collection collection) {
                    collection.forEach(element -> {
                        if (element instanceof String) {
                            // NOTE(review): the string returned by replace() is not
                            // assigned or stored anywhere, so `collection` still holds
                            // the original element after this statement.
                            element.replace('_',' ');
                        } else {
                            // Non-String elements are assumed to be collections themselves
                            // (the arrays under sample_tags) -- TODO confirm for other data.
                            findReplace(element);
                        }
                        return true;
                        })
                    }

                    // Driver: for each entry of ctx.samples, run findReplace over the
                    // values() view of that sample's sample_tags map.
                    Collection samples = ctx.samples;
                    samples.forEach(sample -> { //sample.sample_tags is a HashMap
                        Collection sample_tags = sample.sample_tags.values();
                        findReplace(sample_tags);
                        return true;
                    })
                """
            }
        }
    ]
}

When I simulate the pipeline ingestion, I find the string values are not modified. Where am I going wrong?

POST /_ingest/pipeline/samples/_simulate
{
  "docs": [
    {
      "_index": "samples",
      "_id": "xUSU_3UB5CXFr25x7DcC",
      "_source": {
        "samples": [
          {
            "sample_tags": {
              "Entry_A": [
                "A_hyphentated-sample",
                "sample1"
              ],
              "Entry_B": "A_multiple_underscore_example",
              "Entry_C": [
                "sample2",
                "another_example_with_underscores"
              ],
              "Entry_E": "last_example"
            }
          }
        ]
      }
    }
  ]
}

Result:

{
  "docs" : [
    {
      "doc" : {
        "_index" : "samples",
        "_type" : "_doc",
        "_id" : "xUSU_3UB5CXFr25x7DcC",
        "_source" : {
          "samples" : [
            {
              "sample_tags" : {
                "Entry_E" : "last_example",
                "Entry_C" : [
                  "sample2",
                  "another_example_with_underscores"
                ],
                "Entry_B" : "A_multiple_underscore_example",
                "Entry_A" : [
                  "A_hyphentated-sample",
                  "sample1"
                ]
              }
            }
          ]
        },
        "_ingest" : {
          "timestamp" : "2020-12-01T17:29:52.3917165Z"
        }
      }
    }
  ]
}


Solution

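  • Strings are immutable in Painless (as in Java): `element.replace('_',' ')` returns a new string, and your script discards it instead of writing it back into the map or list, so the document never changes. Here is a modified version of your script that stores the replaced values and will work on the data you provided: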

    PUT /_ingest/pipeline/samples
    {
      "description": "preprocessing of samples.json",
      "processors": [
        {
          "script": {
            "tag": "remove underscore from sample_tags values",
            "source": """
              // Returns `value` with every underscore replaced by a space.
              // Strings are immutable, so the caller must store the returned string.
              String replaceString(String value) {
                return value.replace('_',' ');
              }

              // Recursively cleans a single value and returns what should be stored
              // in its place:
              //   - String     -> new string with underscores replaced
              //   - Collection -> new list whose elements were cleaned recursively
              //   - Map        -> same map, values cleaned in place (keys untouched)
              //   - otherwise  -> returned unchanged (numbers, booleans, null, ...)
              def cleanValue(def value) {
                if (value instanceof String) {
                  return replaceString(value);
                }
                if (value instanceof Collection) {
                  // Build a fresh list rather than mutating while iterating.
                  List cleaned = new ArrayList();
                  for (def item : (Collection) value) {
                    cleaned.add(cleanValue(item));
                  }
                  return cleaned;
                }
                if (value instanceof Map) {
                  Map nested = (Map) value;
                  // Overwriting the value of an existing key does not change the
                  // key set, so iterating keySet() here is safe.
                  for (def key : nested.keySet()) {
                    nested[key] = cleanValue(nested[key]);
                  }
                }
                return value;
              }

              // Replaces underscores in every string value reachable from `map`
              // (through nested lists and maps), leaving all keys intact.
              void findReplace(Map map) {
                cleanValue(map);
              }

              // Driver: clean sample_tags of every sample. Documents without a
              // `samples` list, or samples without a `sample_tags` object, are
              // skipped instead of failing the pipeline.
              if (ctx.samples instanceof Collection) {
                for (def sample : ctx.samples) {
                  if (sample instanceof Map && sample.sample_tags instanceof Map) {
                    findReplace(sample.sample_tags);
                  }
                }
              }
              """
          }
        }
      ]
    }
    

    The result looks like this:

         {
          "samples" : [
            {
              "sample_tags" : {
                "Entry_E" : "last example",
                "Entry_C" : [
                  "sample2",
                  "another example with underscores"
                ],
                "Entry_B" : "A multiple underscore example",
                "Entry_A" : [
                  "A hyphentated-sample",
                  "sample1"
                ]
              }
            }
          ]
        }