Search code examples
Tags: spring, spring-boot, elasticsearch, spring-data, spring-data-elasticsearch

Create custom analyzer with asciifolding filter in Spring Data Elasticsearch


I want to retrieve the same object whether I search with cozum or çözüm, after saving it with the name çözüm. From what I've found, the asciifolding filter is the suggested approach. How can I implement this using Spring Data Elasticsearch?

    /**
     * Entity mapped to the "erp" index.
     * In this version no analyzer is declared on any property.
     */
    @Document(indexName = "erp")
    public class Company {
    
        /** Document identifier. */
        @Id
        private String id;
    
        // No @Field annotation here, so no explicit mapping (and no analyzer) is declared for name.
        private String name;
    
        // Likewise, no explicit mapping is declared for description.
        private String description;
    
        // Employees are declared as a nested field with includeInParent enabled.
        @Field(type = FieldType.Nested, includeInParent = true)
        private List<Employee> employees;

        // getters, setters
    }

Solution

  • You will need to create an analyzer that uses the asciifolding filter (see the Elasticsearch docs) and add it to the settings of your index.

    You can then reference this analyzer in the @Field annotation of the name property.

    Edit: complete example

    The first thing is a file for the index settings; I named it erp-company.json and saved it under src/main/resources:

    {
      "analysis": {
        "analyzer": {
          "custom_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "char_filter": [
              "html_strip"
            ],
            "filter": [
              "lowercase",
              "asciifolding"
            ]
          }
        }
      }
    }
    

    Then you need to reference this file and the analyzer in your entity class, here named Company:

    /**
     * Entity mapped to the "erp" index.
     * The index settings are taken from /erp-company.json, the file that defines "custom_analyzer".
     */
    @Document(indexName = "erp")
    @Setting(settingPath = "/erp-company.json")
    public class Company {
    
        /** Document identifier. */
        @Id
        private String id;
    
        // Text field analyzed with "custom_analyzer"; the name must match the analyzer key in the settings file.
        @Field(type = FieldType.Text, analyzer = "custom_analyzer")
        private String name;
    
        // Text field using the same "custom_analyzer" as name.
        @Field(type = FieldType.Text, analyzer = "custom_analyzer")
        private String description;
    
        // getters, setters
    }
    

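    The CompanyController that uses this (CompanyRepository is not shown; it is assumed to be a Spring Data repository for Company that declares SearchHits<Company> searchByName(String name)):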

    /**
     * REST endpoints under {@code /company} for storing {@link Company} documents
     * and searching them by name.
     */
    @RestController
    @RequestMapping("/company")
    public class CompanyController {
    
        private final CompanyRepository repository;
    
        /**
         * @param repository repository that all requests are delegated to
         */
        public CompanyController(CompanyRepository repository) {
            this.repository = repository;
        }
    
        /**
         * Handles POST requests: passes the request body to the repository's save method.
         *
         * @param company company read from the request body
         * @return whatever the repository's save call returns
         */
        @PostMapping
        public Company put(@RequestBody Company company) {
            final Company stored = repository.save(company);
            return stored;
        }
    
        /**
         * Handles GET requests on {@code /company/{name}}: searches by the path segment.
         *
         * @param name value of the {@code name} path variable
         * @return the search hits returned by the repository's searchByName call
         */
        @GetMapping("/{name}")
        public SearchHits<Company> get(@PathVariable String name) {
            final SearchHits<Company> hits = repository.searchByName(name);
            return hits;
        }
    }
    

    Saving some data that contains diacritic characters (using httpie):

    http POST :8080/company id=1 name="Renée et François"
    

    Searching without diacritic characters:

    http  GET :8080/company/francois
    
    HTTP/1.1 200
    Cache-Control: no-cache, no-store, max-age=0, must-revalidate
    Connection: keep-alive
    Content-Type: application/json
    Date: Wed, 09 Sep 2020 17:56:16 GMT
    Expires: 0
    Keep-Alive: timeout=60
    Pragma: no-cache
    Transfer-Encoding: chunked
    X-Content-Type-Options: nosniff
    X-Frame-Options: DENY
    X-XSS-Protection: 1; mode=block
    
    {
        "aggregations": null,
        "empty": false,
        "maxScore": 0.2876821,
        "scrollId": null,
        "searchHits": [
            {
                "content": {
                    "description": null,
                    "id": "1",
                    "name": "Renée et François"
                },
                "highlightFields": {},
                "id": "1",
                "index": "erp",
                "innerHits": {},
                "nestedMetaData": null,
                "score": 0.2876821,
                "sortValues": []
            }
        ],
        "totalHits": 1,
        "totalHitsRelation": "EQUAL_TO"
    }
    

    The index information that Elasticsearch returns for the index:

    {
        "erp": {
            "aliases": {},
            "mappings": {
                "properties": {
                    "_class": {
                        "fields": {
                            "keyword": {
                                "ignore_above": 256,
                                "type": "keyword"
                            }
                        },
                        "type": "text"
                    },
                    "description": {
                        "analyzer": "custom_analyzer",
                        "type": "text"
                    },
                    "id": {
                        "fields": {
                            "keyword": {
                                "ignore_above": 256,
                                "type": "keyword"
                            }
                        },
                        "type": "text"
                    },
                    "name": {
                        "analyzer": "custom_analyzer",
                        "type": "text"
                    }
                }
            },
            "settings": {
                "index": {
                    "analysis": {
                        "analyzer": {
                            "custom_analyzer": {
                                "char_filter": [
                                    "html_strip"
                                ],
                                "filter": [
                                    "lowercase",
                                    "asciifolding"
                                ],
                                "tokenizer": "standard",
                                "type": "custom"
                            }
                        }
                    },
                    "creation_date": "1599673911503",
                    "number_of_replicas": "1",
                    "number_of_shards": "1",
                    "provided_name": "erp",
                    "uuid": "lRwcKcPUQxKKGuNJ6G30uA",
                    "version": {
                        "created": "7090099"
                    }
                }
            }
        }
    }