I have looked at several related questions, but I'm not sure why their solutions don't work for me.
I would normally use an analyzer like below.
import lucene
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import IndexWriterConfig, IndexWriter
from org.apache.lucene.store import SimpleFSDirectory
from java.nio.file import Paths
from org.apache.lucene.document import Document, Field, TextField
# The JVM must be started before any Lucene class is touched.
lucene.initVM()

INDEX_PATH = "./index"  # on-disk location of the Lucene index

# Wire up the index writer: directory + analyzer-backed configuration.
directory = SimpleFSDirectory(Paths.get(INDEX_PATH))
writer_config = IndexWriterConfig(WhitespaceAnalyzer())
index_writer = IndexWriter(directory, writer_config)

# Index a single document with one stored, tokenized "title" field.
document = Document()
document.add(Field("title", "The quick brown fox.", TextField.TYPE_STORED))
index_writer.addDocument(document)

# Flush and release all resources.
index_writer.close()
directory.close()
Instead of the WhitespaceAnalyzer(), I would like to use a custom MyAnalyzer() that combines a WhitespaceTokenizer with a LowerCaseFilter.
from org.apache.lucene.analysis.core import LowerCaseFilter, WhitespaceTokenizer
from org.apache.pylucene.analysis import PythonAnalyzer
class MyAnalyzer(PythonAnalyzer):
def __init__(self):
PythonAnalyzer.__init__(self)
def createComponents(self, fieldName):
# TODO: build a WhitespaceTokenizer, wrap it in a LowerCaseFilter,
# and return both as a TokenStreamComponents — what is the exact code?
Can you please help me write and use MyAnalyzer()?
I found, in other answers and the PyLucene test suite, that the code below works.
from org.apache.lucene.analysis.core import LowerCaseFilter, WhitespaceTokenizer
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.lucene.analysis import Analyzer
class MyAnalyzer(PythonAnalyzer):
    """Analyzer that splits text on whitespace and lower-cases each token."""

    def __init__(self):
        PythonAnalyzer.__init__(self)

    def createComponents(self, fieldName):
        # Tokenize on whitespace first, then lower-case every token.
        tokenizer = WhitespaceTokenizer()
        stream = LowerCaseFilter(tokenizer)
        return Analyzer.TokenStreamComponents(tokenizer, stream)
It would be great if someone could also point me in the right direction so I can find answers like this on my own.