Search code examples
search · lucene · full-text-search · pylucene

Search Index - 12 = Twelve


I'm curious what the best method is to deal with tokenizing/indexing terms (in Lucene, or any search engine for that matter) so that these searches would match corresponding terms.

"12" = "twelve"

"mx1" = "mx one"

Is there any built-in functionality I've overlooked?


Solution

  • The simplest way in Lucene is to create 2 separate token filters to use after the initial string has been tokenized. The first one needs to split between sequences of digits and non-digits. The second one would then convert numbers (digit strings) into their numerical (spelled) numbers.

    Here's an example with PyLucene (excluding offset and position attribute logic):

    class AlphaNumberBoundaryFilter(lucene.PythonTokenFilter):
        """Token filter that splits each incoming token at digit/non-digit
        boundaries, e.g. "mx1" -> "mx", "1".

        Offset and position attribute handling is intentionally omitted.
        """
        # Matches either a maximal run of digits or a maximal run of
        # non-digits.  NOTE(fix): the original literal contained a stray
        # '"' inside the raw string, which terminated it early and made
        # the line a syntax error.
        seq = re.compile(r"((?:\d+)|(?:\D+))")

        def __init__(self, in_stream):
            """Eagerly drain *in_stream*, split every token, and expose
            the resulting tokens through incrementToken()."""
            lucene.PythonTokenFilter.__init__(self, in_stream)
            term = self.term = self.addAttribute(lucene.TermAttribute.class_)
            # Pull every token from the upstream stream; term.term()
            # reflects the token most recently produced upstream.
            tokens = []
            while in_stream.incrementToken():
                tokens.append(term.term())
            # Split each token into digit / non-digit runs.
            self.tokens = self.filter(tokens)
            # Iterator consumed by incrementToken().
            self.iter = iter(self.tokens)

        def filter(self, tokens):
            """Return *tokens* with each entry split into its digit and
            non-digit runs, preserving order."""
            seq = self.seq
            return [split for token in tokens for split in seq.findall(token)]

        def incrementToken(self):
            """Advance to the next buffered token; return False when the
            split-token sequence is exhausted."""
            try:
                self.term.setTermBuffer(next(self.iter))
            except StopIteration:
                return False
            return True
    
    
    class NumberToWordFilter(lucene.PythonTokenFilter):
        """Token filter that rewrites purely numeric tokens into their
        English spelled-out words, e.g. "12" -> "twelve"; non-numeric
        tokens pass through unchanged.

        Offset and position attribute handling is intentionally omitted.
        """
        # Word lookup keyed by numeric value (units, teens, tens, scales).
        num_map = {0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty", 60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand", 1000000: "million"}
        # A token is "numeric" only when it consists entirely of digits.
        is_num = re.compile(r"^\d+$")

        def __init__(self, in_stream):
            """Eagerly drain *in_stream*, convert numeric tokens to words,
            and expose the result through incrementToken()."""
            lucene.PythonTokenFilter.__init__(self, in_stream)
            term = self.term = self.addAttribute(lucene.TermAttribute.class_)
            # Pull every token from the upstream stream.
            tokens = []
            while in_stream.incrementToken():
                tokens.append(term.term())
            # Replace digit-only tokens with their spelled-out words.
            self.tokens = self.filter(tokens)
            # Iterator consumed by incrementToken().
            self.iter = iter(self.tokens)

        def filter(self, tokens):
            """Return *tokens* with every digit-only token expanded into
            one word-token per spoken word ("112" -> "one", "hundred",
            "twelve"); other tokens are kept as-is."""
            num_map = self.num_map
            is_num = self.is_num
            final = []
            for token in tokens:
                if not is_num.match(token):
                    final.append(token)
                    continue
                # Reverse the digits (least-significant first) after
                # dropping leading zeros.
                digits = token.lstrip('0')[::-1]
                if not digits:
                    # The token was all zeros.
                    final.append(num_map[0])
                    continue
                # Group every 3 digits and iterate over digit groups in reverse
                # so that groups are yielded in the original order and in each
                # group: 0 -> ones, 1 -> tens, 2 -> hundreds
                # (range, not the Python-2-only xrange, for 2/3 compatibility)
                groups = [digits[i:i+3] for i in range(0, len(digits), 3)][::-1]
                scale = len(groups) - 1
                result = []
                for oth in groups:
                    # Remember where this group's words start so we can
                    # tell whether it contributed anything.
                    group_start = len(result)
                    l = len(oth)
                    if l == 3 and oth[2] != '0':
                        # Hundreds digit: "<digit> hundred".
                        result.append(num_map[int(oth[2])])
                        result.append(num_map[100])
                    if l >= 2:
                        if oth[1] == '1':
                            # Teens (10-19): read the two reversed digits
                            # back in original order as one word.
                            result.append(num_map[int(oth[1::-1])])
                        else:
                            if oth[1] != '0':
                                # Tens digit >= 2: "twenty" .. "ninety".
                                result.append(num_map[int(oth[1]) * 10])
                            if oth[0] != '0':
                                result.append(num_map[int(oth[0])])
                    elif oth[0] != '0':
                        # Single-digit group.
                        result.append(num_map[int(oth[0])])
                    # Append the scale words (thousand/million...) only when
                    # the group actually produced digits.  NOTE(fix): the
                    # original appended the scale unconditionally, so an
                    # all-zero middle group (e.g. 1000000) emitted a
                    # spurious "thousand": "one million thousand".
                    if len(result) > group_start:
                        s = scale
                        if s % 2:
                            result.append(num_map[1000])
                        while s >= 2:
                            result.append(num_map[1000000])
                            s -= 2
                    scale -= 1
                final.extend(result)
            return final

        def incrementToken(self):
            """Advance to the next buffered token; return False when the
            converted-token sequence is exhausted."""
            try:
                self.term.setTermBuffer(next(self.iter))
            except StopIteration:
                return False
            return True