Search code examples
c# · .net · lucene.net · stemming · lemmatization

How to remove plurals in Lucene.NET?


I'm trying to extract some keywords from a text. It works quite fine but I need to remove plurals.

As I'm already using Lucene for search, I'm trying to use it to extract keywords from the indexed terms.

First, I index the document in a RAMDirectory:

// In-memory directory that receives the single document to analyze.
RAMDirectory idx = new RAMDirectory();

// Analyzer configured with the stop words and stemmer language of this instance.
var analyzer = new CustomStandardAnalyzer(
    StopWords.Get(this.Language),
    Lucene.Net.Util.Version.LUCENE_30,
    this.Language);

using (var writer = new IndexWriter(idx, analyzer, IndexWriter.MaxFieldLength.LIMITED))
{
    var doc = createDocument(this._text);
    writer.AddDocument(doc);
    writer.Optimize();
}

Then, I extract the keywords:

// Collects (frequency, term) pairs for the "text" field of document 0.
var list = new List<KeyValuePair<int, string>>();
using (var reader = IndexReader.Open(directory, true))
{
    var vector = reader.GetTermFreqVector(0, "text");

    // The vector may be null, in which case the list stays empty.
    if (vector != null)
    {
        string[] terms = vector.GetTerms();
        int[] counts = vector.GetTermFrequencies();

        // terms[n] and counts[n] are read at the same index.
        int position = 0;
        foreach (string term in terms)
        {
            list.Add(new KeyValuePair<int, string>(counts[position], term));
            position++;
        }
    }
}

In the list of terms I can end up with both "president" and "presidents".
How can I remove the plural forms?
My CustomStandardAnalyzer uses this:

/// <summary>
/// Builds the analysis chain: standard tokenization, lower-casing, Snowball
/// stemming, ASCII folding and stop-word removal, in that order.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not used by this chain.</param>
/// <param name="reader">Reader supplying the text to tokenize.</param>
/// <returns>The token stream produced by the last filter of the chain.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    // Create the tokenizer.
    TokenStream result = new StandardTokenizer(this.version, reader);

    // Lower-case BEFORE stemming: SnowballFilter expects lowercased text, so
    // capitalized tokens (e.g. "Presidents") may otherwise be left unstemmed
    // and end up as separate terms from their lowercase forms.
    result = new LowerCaseFilter(result);
    result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, this.getStemmer());
    result = new ASCIIFoldingFilter(result);

    // Uses StopWords.English when no stop-word list was configured.
    // Note: stop words are matched against the stemmed, folded tokens.
    result = new StopFilter(true, result, this.stopWords ?? StopWords.English);

    return result;
}

So I am already using the SnowballFilter (with the correct language-specific stemmer). How can I remove plurals?


Solution

  • My output from the following program is:

    text:and
    text:presid
    text:some
    text:text
    text:with
    
    /// <summary>
    /// Minimal console sample: indexes one sentence into an in-memory index with a
    /// stemming analyzer, then prints every indexed term so the effect of the
    /// stemmer ("president" and "presidents" collapsing into one term) is visible.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Analyzer chaining standard tokenization, lower-casing, English Snowball
        /// stemming, ASCII folding and an (empty) stop-word filter.
        /// </summary>
        private class CustomStandardAnalyzer : Analyzer
        {
            /// <summary>Builds the token stream for the given reader.</summary>
            /// <param name="fieldName">Name of the field being analyzed; not used here.</param>
            /// <param name="reader">Reader supplying the text to tokenize.</param>
            /// <returns>The token stream produced by the last filter of the chain.</returns>
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                // Create the tokenizer.
                TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

                // Lower-case BEFORE stemming: SnowballFilter expects lowercased text,
                // so capitalized tokens may otherwise be left unstemmed.
                result = new LowerCaseFilter(result);
                result = new Lucene.Net.Analysis.Snowball.SnowballFilter(result, new EnglishStemmer());
                result = new ASCIIFoldingFilter(result);

                // Empty stop-word set: no token is removed in this sample.
                result = new StopFilter(true, result, new HashSet<string>());
                return result;
            }
        }

        /// <summary>
        /// Wraps <paramref name="text"/> in a document with a single stored, analyzed
        /// "text" field that records term vectors with positions and offsets.
        /// </summary>
        /// <param name="text">Content of the "text" field.</param>
        /// <returns>The new document.</returns>
        private static Document createDocument(string text)
        {
            Document d = new Document();
            d.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            return d;
        }

        /// <summary>
        /// Indexes the sample sentence and writes each indexed term to the console,
        /// then waits for a line of input before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            RAMDirectory idx = new RAMDirectory();
            using (IndexWriter writer =
                new IndexWriter(
                    idx,
                    new CustomStandardAnalyzer(),
                    IndexWriter.MaxFieldLength.LIMITED))
            {
                writer.AddDocument(createDocument("some text with president and presidents"));
                writer.Commit();
            }

            using (var reader = IndexReader.Open(idx, true))
            // Dispose the term enumerator together with the reader.
            using (var terms = reader.Terms(new Term("text", "")))
            {
                // The enumerator is already positioned on its first term (if any),
                // hence the do/while: print the current term, then advance.
                if (terms.Term != null)
                {
                    do
                    {
                        Console.WriteLine(terms.Term);
                    }
                    while (terms.Next());
                }
            }

            Console.ReadLine();
        }
    }