Search code examples
Tags: c#, filter, token, nest, full-text-indexing

C# NEST Elasticsearch custom filter structure (tokenize)


I am trying to rewrite this specific query to C# NEST, but I'm stuck on defining the filters... I'm confused...

{  
   "settings":{  
      "analysis":{  
         "filter":{  
            "lemmagen_filter_sk":{  
               "type":"lemmagen",
               "lexicon":"sk"
            },
            "synonym_filter":{  
               "type":"synonym",
               "synonyms_path":"synonyms/sk_SK.txt",
               "ignore_case":true
            },
            "stopwords_SK":{  
               "type":"stop",
               "stopwords_path":"stop-words/stop-words-slovak.txt",
               "ignore_case":true
            }
         },
        "analyzer":{  
            "slovencina_synonym":{  
               "type":"custom",
               "tokenizer":"standard",
               "filter":[  
                  "stopwords_SK",
                  "lemmagen_filter_sk",
                  "lowercase",
                  "stopwords_SK",
                  "synonym_filter",
                  "asciifolding"
               ]
            },
            "slovencina":{  
               "type":"custom",
               "tokenizer":"standard",
               "filter":[  
                  "stopwords_SK",
                  "lemmagen_filter_sk",
                  "lowercase",
                  "stopwords_SK",
                  "asciifolding"
               ]
            },

I expect to have right client.CreateIndex(...) command with right index settings. All I have now is this:

// Creates the index from a pre-built IndexState (indexConfig) and auto-maps T.
// NOTE(review): no custom analysis is configured here — the analyzers from the
// JSON above are presumably expected to come from indexConfig; confirm it
// actually contains them, otherwise they will be missing from the index.
client.CreateIndex(indexName, c => c
    .InitializeUsing(indexConfig)
    .Mappings(m => m
        .Map<T>(mp => mp.AutoMap())));

I cannot find any information on how to do this. I would be grateful for any kind of help.

EDIT:

// Creates the index with custom analysis (token filters + analyzers) and mappings.
// NOTE(review): InitializeUsing(indexConfig) supplies a complete index state and
// may override the fluent .Settings(...) below — confirm indexConfig does not
// clobber the analysis section (this would explain analyzers missing from
// GET _settings).
client.CreateIndex(indexName, c => c
    .InitializeUsing(indexConfig)
    .Settings(s => s
        .Analysis(a => a
            .TokenFilters(t => t
                // Custom "lemmagen" filter — not built into NEST, so it is
                // registered through a user-defined ITokenFilter implementation.
                .UserDefined("lemmagen_filter_sk",
                    new LemmagenTokenFilter { Lexicon = "sk" })
                .Synonym("synonym_filter", ts => ts
                    .SynonymsPath("synonyms/sk_SK.txt")
                    .IgnoreCase(true))
                // Filter names are case-sensitive: this was registered as
                // "stopwords_sk" while the analyzers referenced "stopwords_SK",
                // and the path was missing its ".txt" extension.
                .Stop("stopwords_SK", tst => tst
                    .StopWordsPath("stop-words/stop-words-slovak.txt")
                    .IgnoreCase(true))
            )
            .Analyzers(aa => aa
                // The duplicate "stopwords_SK" entry was dropped: with
                // ignore_case already true, running the stop filter a second
                // time after "lowercase" removes nothing new.
                .Custom("slovencina_synonym", acs => acs
                    .Tokenizer("standard")
                    .Filters("stopwords_SK", "lemmagen_filter_sk", "lowercase", "synonym_filter", "asciifolding"))
                .Custom("slovencina", acs => acs
                    .Tokenizer("standard")
                    .Filters("stopwords_SK", "lemmagen_filter_sk", "lowercase", "asciifolding"))
            )
        )
    )
    .Mappings(m => m
        .Map<DealItem>(mp => mp.AutoMap()
            .Properties(p => p
                // Each field needs its own property definition: chaining
                // .Name(...) repeatedly on a single Text() descriptor keeps
                // only the last name, so only coupon_text2 got the analyzer.
                .Text(t => t
                    .Name(n => n.title_dealitem)
                    .Analyzer("slovencina_synonym"))
                .Text(t => t
                    .Name(n => n.coupon_text1)
                    .Analyzer("slovencina_synonym"))
                .Text(t => t
                    .Name(n => n.coupon_text2)
                    .Analyzer("slovencina_synonym"))
            ))));

This is what I have now, but I'm getting an ERROR when trying to use one of the analyzers:

POST dealitems/_analyze
{
  "analyzer": "slovencina",
  "text":     "Janko kúpil nové topánky"
}

ERROR:

{
  "error": {
    "root_cause": [
      {
        "type": "remote_transport_exception",
        "reason": "[myNode][127.0.0.1:9300][indices:admin/analyze[s]]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "failed to find analyzer [slovencina]"
  },
  "status": 400
}

and GET _settings doesn't show any analyzers

RESULT: The problem was missing files — the paths were wrong.


Solution

  • Indeed, there is no lemmagen token filter available out of the box in NEST. Fortunately, you can easily create your own:

    /// <summary>
    /// Token filter definition for the Elasticsearch "lemmagen" plugin,
    /// which NEST does not provide out of the box.
    /// </summary>
    public class LemmagenTokenFilter : ITokenFilter
    {
        /// <summary>Optional version string required by <see cref="ITokenFilter"/>.</summary>
        public string Version { get; set; }

        /// <summary>The Elasticsearch filter type name.</summary>
        public string Type
        {
            get { return "lemmagen"; }
        }

        /// <summary>Lexicon (language) code to use, e.g. "sk" for Slovak.</summary>
        [JsonProperty("lexicon")]
        public string Lexicon { get; set; }
    }
    
    
    var response = elasticClient.CreateIndex(_defaultIndex,
        d => d.Settings(s => s
            .Analysis(a => a
                .TokenFilters(t => t.UserDefined("lemmagen_filter_sk",
                    new LemmagenTokenFilter
                    {
                        Lexicon = "sk"
                    }))))
                    ..
                    );
    

    Hope that helps.