Tags: indexing, lucene, tokenize, n-gram

Generate N-grams while preserving spaces in Apache Lucene


I am trying to generate N-grams using Apache Lucene 5.5.4 for a given input text. Following is my Java code to do the same.

public static void main( String[] args )
    {
        Analyzer analyzer = createAnalyzer( 2 );
        List<String> nGrams = generateNgrams( analyzer, "blah1  blah2  blah3" );

        for ( String nGram : nGrams ) {
            System.out.println( nGram );
        }
    }


    public static Analyzer createAnalyzer( final int shingles )
    {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents( @NotNull String field )
            {
                final Tokenizer source = new WhitespaceTokenizer();
                final ShingleFilter shingleFilter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
                shingleFilter.setOutputUnigrams( true );
                return new TokenStreamComponents( source, shingleFilter );
            }
        };
    }


    public static List<String> generateNgrams( Analyzer analyzer, String str )
    {
        List<String> result = new ArrayList<>();
        try {
            TokenStream stream = analyzer.tokenStream( null, new StringReader( str ) );
            stream.reset();
            while ( stream.incrementToken() ) {
                String nGram = stream.getAttribute( CharTermAttribute.class ).toString();
                result.add( nGram );
                LOG.debug( "Generated N-gram = {}", nGram );
            }
            stream.end();
            stream.close(); // release the stream so the analyzer can be reused
        } catch ( IOException e ) {
            LOG.error( "IO Exception occurred!", e );
        }
        return result;
    }

For my input blah1  blah2  blah3, the output is as follows and I am okay with it:

blah1

blah1 blah2

blah2

blah2 blah3

blah3

However, when the input has multiple spaces between words, for example Foo  bar  Foo2 (two spaces between the words), my requirement is to generate the following output:

  1. Foo
  2. Foo  bar
  3. bar
  4. bar  Foo2
  5. Foo2

If you noticed, I have to preserve the spaces between two words exactly as they appear in the input (Foo  bar with two spaces, not Foo bar with a single one).

Can I make any tweaks and ask Lucene to handle it internally?

Maybe it's a minor tweak like adding a filter or something, but since I am new to Lucene, I don't know where to start. Thanks in advance.


Solution

  • I had to write a custom tokenizer and a trim filter to achieve this.

    1) I created an abstract class DelimiterPreservingCharTokenizer by extending the org.apache.lucene.analysis.Tokenizer class and gave my own implementation of the incrementToken method. I would have extended org.apache.lucene.analysis.util.CharTokenizer instead, but its incrementToken method is final and cannot be overridden. DelimiterPreservingCharTokenizer looks like below.

    package spellcheck.lucene.tokenizers;
    
    import java.io.IOException;
    
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.util.CharTokenizer;
    import org.apache.lucene.analysis.util.CharacterUtils;
    import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
    import org.apache.lucene.util.AttributeFactory;
    
    
    /**
     * 
     * @author Arun Gowda.
     * This class is exactly the same as {@link CharTokenizer}, except that each emitted token keeps its trailing delimiters. This is to support N-gram vicinity matches.
     * 
     * We are creating a new class instead of extending CharTokenizer because its incrementToken method is final and we cannot override it.
     *
     */
    public abstract class DelimiterPreservingCharTokenizer extends Tokenizer
    {
    
    
        /**
         * Creates a new {@link DelimiterPreservingCharTokenizer} instance
         */
        public DelimiterPreservingCharTokenizer()
        {}
    
    
        /**
         * Creates a new {@link DelimiterPreservingCharTokenizer} instance
         * 
         * @param factory
         *          the attribute factory to use for this {@link Tokenizer}
         */
        public DelimiterPreservingCharTokenizer( AttributeFactory factory )
        {
            super( factory );
        }
    
        private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
        private static final int MAX_WORD_LEN = 255;
        private static final int IO_BUFFER_SIZE = 4096;
    
        private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
        private final OffsetAttribute offsetAtt = addAttribute( OffsetAttribute.class );
    
        private final CharacterUtils charUtils = CharacterUtils.getInstance();
        private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer( IO_BUFFER_SIZE );
    
    
        /**
         * Returns true iff a codepoint should be included in a token. This tokenizer
         * generates as tokens adjacent sequences of codepoints which satisfy this
         * predicate. Codepoints for which this is false are used to define token
         * boundaries and are not included in tokens.
         */
        protected abstract boolean isTokenChar( int c );
    
    
        /**
         * Called on each token character to normalize it before it is added to the
         * token. The default implementation does nothing. Subclasses may use this to,
         * e.g., lowercase tokens.
         */
        protected int normalize( int c )
        {
            return c;
        }
    
    
        @Override
        public final boolean incrementToken() throws IOException
        {
            clearAttributes();
            int length = 0;
            int start = -1; // this variable is always initialized
            int end = -1;
            char[] buffer = termAtt.buffer();
            while ( true ) {
                if ( bufferIndex >= dataLen ) {
                    offset += dataLen;
                    charUtils.fill( ioBuffer, input ); // read supplementary char aware with CharacterUtils
                    if ( ioBuffer.getLength() == 0 ) {
                        dataLen = 0; // so next offset += dataLen won't decrement offset
                        if ( length > 0 ) {
                            break;
                        } else {
                            finalOffset = correctOffset( offset );
                            return false;
                        }
                    }
                    dataLen = ioBuffer.getLength();
                    bufferIndex = 0;
                }
                // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
                final int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
                final int charCount = Character.charCount( c );
                bufferIndex += charCount;
    
                if ( isTokenChar( c ) ) { // if it's a token char
                    if ( length == 0 ) { // start of token
                        assert start == -1;
                        start = offset + bufferIndex - charCount;
                        end = start;
                    } else if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
                        buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
                    }
                    end += charCount;
                    length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
                    if ( length >= MAX_WORD_LEN ) // buffer overflow! make sure to check for >= surrogate pair could break == test
                        break;
                } else if ( length > 0 ) // at non-Letter w/ chars
                    break; // return 'em
            }
    
            if ( length > 0 && bufferIndex < ioBuffer.getLength() ) {//If at least one token is found,
    
                //THIS IS THE PART WHICH IS DIFFERENT FROM LUCENE's CHARTOKENIZER
    
                // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
                int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
                int charCount = Character.charCount( c );
                bufferIndex += charCount;
    
                while ( !isTokenChar( c ) && bufferIndex < ioBuffer.getLength() ) {// As long as we find delimiter(not token char), keep appending it to output stream.
    
                    if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
                        buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
                    }
    
                    end += charCount;
    
                    length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
    
                    if ( length >= MAX_WORD_LEN ) {// buffer overflow! make sure to check for >= surrogate pair could break == test
                        break;
                    }
    
                    c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
                    charCount = Character.charCount( c );
                    bufferIndex += charCount;
                }
                //ShingleFilter will add a delimiter. Hence, the last iteration is skipped.
                //That is, for input `abc   def   ghi`, this tokenizer will return `abc  `(2 spaces only). Then, Shingle filter will by default add another delimiter making it `abc   `(3 spaces as it is in the input).
                //If there are N delimiters, this token will at max return N-1 delimiters
    
                bufferIndex -= charCount;
            }
            termAtt.setLength( length );
            assert start != -1;
            offsetAtt.setOffset( correctOffset( start ), finalOffset = correctOffset( end ) );
            return true;
        }
    
    
        @Override
        public final void end() throws IOException
        {
            super.end();
            // set final offset
            offsetAtt.setOffset( finalOffset, finalOffset );
        }
    
    
        @Override
        public void reset() throws IOException
        {
            super.reset();
            bufferIndex = 0;
            offset = 0;
            dataLen = 0;
            finalOffset = 0;
            ioBuffer.reset(); // make sure to reset the IO buffer!!
        }
    }

    2) A concrete class WhiteSpacePreservingTokenizer that extends the abstract class above and defines which characters are treated as delimiters. (A quick check of what it emits is shown right after the class.)

    package spellcheck.lucene.tokenizers;
    
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.util.AttributeFactory;
    
    /**
     * 
     * @author Arun Gowda
     *
     * This class is exactly the same as {@link WhitespaceTokenizer}. The only difference is that it extends DelimiterPreservingCharTokenizer instead of CharTokenizer.
     */
    public class WhiteSpacePreservingTokenizer extends DelimiterPreservingCharTokenizer
    {
    
        /**
         * Construct a new WhitespaceTokenizer.
         */
        public WhiteSpacePreservingTokenizer()
        {}
    
    
        /**
         * Construct a new WhitespaceTokenizer using a given
         * {@link org.apache.lucene.util.AttributeFactory}.
         *
         * @param factory
         *          the attribute factory to use for this {@link Tokenizer}
         */
        public WhiteSpacePreservingTokenizer( AttributeFactory factory )
        {
            super( factory );
        }
    
    
        /** Collects only characters which do not satisfy
         * {@link Character#isWhitespace(int)}.*/
        @Override
        protected boolean isTokenChar( int c )
        {
            return !Character.isWhitespace( c );
        }
    }
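
    To sanity-check the tokenizer on its own (before ShingleFilter and trimming), a quick sketch like the one below can be used. It assumes only the two classes above and prints each token in brackets so the preserved trailing spaces are visible. As noted in the incrementToken comments, for N delimiters between two words the tokenizer keeps at most N-1 of them; ShingleFilter adds the remaining one when it joins tokens.

    Tokenizer tokenizer = new WhiteSpacePreservingTokenizer();
    tokenizer.setReader( new StringReader( "abc   def   ghi" ) );
    CharTermAttribute term = tokenizer.addAttribute( CharTermAttribute.class );
    tokenizer.reset();
    while ( tokenizer.incrementToken() ) {
        System.out.println( "[" + term + "]" ); // e.g. [abc  ] keeps two of the three spaces
    }
    tokenizer.end();
    tokenizer.close();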

    3) The tokenizer above will result in trailing spaces (e.g. blah____, where the underscores stand for the trailing spaces). We need to add a filter to trim those spaces, so we need a DelimiterTrimFilter as follows. (We could also just trim with Java's String.trim(), but doing so would be inefficient since it creates a new String for every token; a sketch of that simpler alternative is shown after the filter.)

    package spellcheck.lucene.filters;
    
    import java.io.IOException;
    
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    
    
    public class DelimiterTrimFilter extends TokenFilter
    {
    
    
        private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
    
        private char delimiter;
    
    
        /**
         * Create a new {@link DelimiterTrimFilter}.
         * @param in            the stream to consume
         * @param delimiterToTrim delimiter that should be trimmed
         */
        public DelimiterTrimFilter( TokenStream in, char delimiterToTrim )
        {
            super( in );
            this.delimiter = delimiterToTrim;
        }
    
    
        @Override
        public boolean incrementToken() throws IOException
        {
            if ( !input.incrementToken() )
                return false;
    
            char[] termBuffer = termAtt.buffer();
            int len = termAtt.length();
    
            if ( len == 0 ) {
                return true;
            }
            int start = 0;
            int end = 0;
    
            // eat the first characters
            for ( start = 0; start < len && termBuffer[start] == delimiter; start++ ) {
            }
            // eat the end characters
            for ( end = len; end >= start && termBuffer[end - 1] == delimiter; end-- ) {
            }
            if ( start > 0 || end < len ) {
                if ( start < end ) {
                    termAtt.copyBuffer( termBuffer, start, ( end - start ) );
                } else {
                    termAtt.setEmpty();
                }
            }
            return true;
        }
    
    
    }
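
    For comparison, the simpler String.trim()-based alternative mentioned above could look like the hypothetical sketch below (it is not part of the actual solution). It is shorter, but it allocates a new String and copies the term buffer for every token, and it trims all whitespace rather than one chosen delimiter:

    package spellcheck.lucene.filters;
    
    import java.io.IOException;
    
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    
    
    /**
     * Hypothetical, simpler alternative to DelimiterTrimFilter, shown only to illustrate the cost of trimming via String.trim().
     */
    public class StringTrimFilter extends TokenFilter
    {
    
        private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
    
    
        public StringTrimFilter( TokenStream in )
        {
            super( in );
        }
    
    
        @Override
        public boolean incrementToken() throws IOException
        {
            if ( !input.incrementToken() ) {
                return false;
            }
            String trimmed = termAtt.toString().trim(); // allocates a new String per token
            termAtt.setEmpty().append( trimmed );
            return true;
        }
    }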

    4) My createAnalyzer now looks like below:

    public static Analyzer createAnalyzer( final int shingles )
        {
            return new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents( @NotNull String field )
                {
                    final Tokenizer source = new WhiteSpacePreservingTokenizer();
                    TokenStream filter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
                    filter = new DelimiterTrimFilter( filter, ' ' );
                    return new TokenStreamComponents( source, filter );
                }
            };
        }

    The rest of the code remains the same.
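
    With these pieces in place, running the unchanged main and generateNgrams methods against an input that contains double spaces should, assuming the pipeline behaves as described above, produce shingles with the inner spaces preserved. Note that the LowerCaseFilter in the chain lowercases the terms:

    Analyzer analyzer = createAnalyzer( 2 );
    List<String> nGrams = generateNgrams( analyzer, "Foo  bar  Foo2" ); // two spaces between words
    for ( String nGram : nGrams ) {
        System.out.println( "[" + nGram + "]" );
    }
    // Expected, per the requirement in the question (lowercased by LowerCaseFilter):
    // [foo], [foo  bar], [bar], [bar  foo2], [foo2]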