Search code examples
java hibernate lucene hibernate-search

How to do case insensitive sorting of Norwegian characters (Æ, Ø, and Å) using Hibernate Lucene Search?


Æ, Ø, and Å are the last three letters of the Norwegian alphabet:

A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Æ Ø Å

When we sort using Hibernate Search (Lucene), Å is grouped with A, Ø is grouped with O, and Æ is grouped with A, which is wrong. For example:

Current Results:

Aaalu, Åaalu, Baalu, Zaalu,

Expected Results:

Aaalu, Baalu, Zaalu, Åaalu,

Following is the code I am currently using:

// Analyzer definition registered under the name "myOwnAnalyzer"; it is the analyzer
// applied to the "KikaPayaName_for_sort" field.
@AnalyzerDef(name = "myOwnAnalyzer",
tokenizer = @TokenizerDef(factory = KeywordTokenizerFactory.class),
filters = {
    // NOTE(review): presumably folds Æ/Ø/Å to plain ASCII letters, which would explain
    // why they end up sorted next to A and O — confirm.
    @TokenFilterDef(factory = ASCIIFoldingFilterFactory.class),
    @TokenFilterDef(factory = LowerCaseFilterFactory.class),
    // Replaces every match of the pattern with a single space.
    // NOTE(review): the pattern is a capturing group, not a character class, so it only
    // matches the literal sequence '-&.,() — presumably a [...] class was intended; confirm.
    @TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
        @Parameter(name = "pattern", value = "('-&\\.,\\(\\))"),
        @Parameter(name = "replacement", value = " "),
        @Parameter(name = "replace", value = "all")
    }),
    // Removes every character that is not a digit, a Unicode letter, or a space.
    @TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
        @Parameter(name = "pattern", value = "([^0-9\\p{L} ])"),
        @Parameter(name = "replacement", value = ""),
        @Parameter(name = "replace", value = "all")
    }),
    @TokenFilterDef(factory = TrimFilterFactory.class)
}
)
public class KikaPaya implements Serializable {

@Fields({ @Field(index = Index.YES, store = Store.YES), @Field(name = "KikaPayaName_for_sort", index = Index.YES, analyzer = @Analyzer(definition = "myOwnAnalyzer")) })
@Column(name = "NAME", length = 100)
private String name;

Main:

  // Loads the first 150 KikaPaya entities, sorted on the dedicated sort field, and logs their names.
  FullTextEntityManager ftem = Search.getFullTextEntityManager(factory.createEntityManager());
  QueryBuilder qb = ftem.getSearchFactory().buildQueryBuilder().forEntity(KikaPaya.class).get();
  // Match-all query: only the ordering of the results is of interest here.
  org.apache.lucene.search.Query query = qb.all().getQuery();
  FullTextQuery fullTextQuery = ftem.createFullTextQuery(query, KikaPaya.class);
  // NOTE(review): the third SortField argument is the "reverse" flag; true presumably yields
  // descending order, while the expected results above are ascending — confirm.
  fullTextQuery.setSort(new Sort(new SortField("KikaPayaName_for_sort", SortField.STRING, true)));
  fullTextQuery.setFirstResult(0).setMaxResults(150);
  int size = fullTextQuery.getResultSize();
  List<KikaPaya> result = fullTextQuery.getResultList();
  // Fixed: the loop variable declaration was missing the space between type and name.
  for (KikaPaya user : result) {
    logger.info("KikaPaya Name:" + user.getName());
  }

Following are the versions of Hibernate, Hibernate Search, and Lucene (which I cannot change):

 <hibernate.version>4.2.8.Final</hibernate.version>
    <hibernate.search.version>4.3.0.Final</hibernate.search.version>

  <dependency>
        <groupId>org.hibernate</groupId>
        <artifactId>hibernate-entitymanager</artifactId>
        <version>4.2.8.Final</version>
    </dependency>
<dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>3.6.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers</artifactId>
        <version>3.6.2</version>
    </dependency>

Could anyone suggest a way to get the correct results?


Solution

  • You can use org.apache.lucene.collation.CollationKeyFilter class in Hibernate Search version 4.3.0.Final. Create your own collation filter factory:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.collation.CollationKeyFilter;
    import org.apache.solr.analysis.BaseTokenFilterFactory;
    
    import java.text.Collator;
    import java.util.Locale;
    
    /**
     * Token filter factory that wraps a token stream in a {@link CollationKeyFilter}
     * configured with the {@link Collator} for the Norwegian locale ({@code no_NO}).
     */
    public final class NorwegianCollationFactory extends BaseTokenFilterFactory {

        /**
         * Wraps the given stream in a collation key filter.
         *
         * @param input the token stream to wrap
         * @return a {@link CollationKeyFilter} over {@code input} using the {@code no_NO} collator
         */
        @Override
        public TokenStream create(TokenStream input) {
            final Locale norwegian = new Locale("no", "NO");
            return new CollationKeyFilter(input, Collator.getInstance(norwegian));
        }
    }
    

    And then use this collation factory in your AnalyzerDef:

    // Analyzer for the sort field: normalizes the whole value as a single token and then
    // turns it into a Norwegian collation key.
    @AnalyzerDef(name = "myOwnAnalyzer",
    tokenizer = @TokenizerDef(factory = KeywordTokenizerFactory.class),
    filters = {
        // ASCIIFoldingFilterFactory is deliberately NOT used here: it folds Æ/Ø/Å to
        // AE/O/A before the collation filter runs, so the Norwegian collator could no
        // longer distinguish them from A and O and the sort order would stay wrong.
        @TokenFilterDef(factory = LowerCaseFilterFactory.class),
        // Replaces every match of the pattern with a single space.
        // NOTE(review): the pattern is a capturing group, not a character class, so it only
        // matches the literal sequence '-&.,() — presumably a [...] class was intended; confirm.
        @TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
            @Parameter(name = "pattern", value = "('-&\\.,\\(\\))"),
            @Parameter(name = "replacement", value = " "),
            @Parameter(name = "replace", value = "all")
        }),
        // Removes every character that is not a digit, a Unicode letter, or a space.
        @TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
            @Parameter(name = "pattern", value = "([^0-9\\p{L} ])"),
            @Parameter(name = "replacement", value = ""),
            @Parameter(name = "replace", value = "all")
        }),
        @TokenFilterDef(factory = TrimFilterFactory.class),
        // Kept last so the collation key is computed from the fully normalized token.
        @TokenFilterDef(factory = NorwegianCollationFactory.class)
    }
    )
    public class KikaPaya implements Serializable {
    

    More information about using this Collation filter with hibernate search version 5 - https://stackoverflow.com/a/60738067/7179509