Search Index - 12 = Twelve

I'm curious what the best method is to deal with tokenizing/indexing terms (in Lucene, or any search engine for that matter) so that the following searches would match their corresponding terms.

"12" = "twelve"

"mx1" = "mx one"

Is there any built-in functionality I've overlooked?

Solution

The simplest way in Lucene is to create two separate token filters to apply after the initial string has been tokenized. The first one splits each token at the boundaries between sequences of digits and non-digits (e.g. "mx1" becomes "mx" and "1"). The second one then converts number tokens (digit strings) into their spelled-out words (e.g. "12" becomes "twelve").

Here's an example with PyLucene (excluding offset and position attribute logic):

class AlphaNumberBoundaryFilter(lucene.PythonTokenFilter):
    """Token filter that splits tokens at digit/non-digit boundaries.

    Every incoming token is broken into its maximal runs of digits and
    non-digits, e.g. ``"mx1"`` becomes the two tokens ``"mx"`` and ``"1"``.
    Only the term text is handled; offset and position attributes are not
    maintained.
    """

    # One maximal run of digits, or one maximal run of non-digits.
    seq = re.compile(r"((?:\d+)|(?:\D+))")

    def __init__(self, in_stream):
        """Drain ``in_stream``, split its tokens, and prepare to replay them.

        NOTE(review): the upstream is consumed eagerly here rather than
        lazily in ``incrementToken``; presumably a later reset/reuse of the
        stream is not expected -- confirm against the analyzer that builds
        this filter.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Collect the term text of every upstream token.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Split the collected tokens.
        self.tokens = self.filter(tokens)
        # Iterator that incrementToken() advances one token at a time.
        self.iter = iter(self.tokens)

    def filter(self, tokens):
        """Return a flat list of the digit/non-digit runs of each token.

        The order of tokens, and of the runs within each token, is kept.
        """
        seq = self.seq
        return [piece for token in tokens for piece in seq.findall(token)]

    def incrementToken(self):
        """Load the next split token into the term attribute.

        Returns True if a token was produced, False once they are exhausted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True


class NumberToWordFilter(lucene.PythonTokenFilter):
    """Token filter that replaces all-digit tokens with their English words.

    A token such as ``"12"`` is replaced by ``"twelve"``; a multi-word number
    such as ``"123"`` is replaced by several tokens (``"one"``, ``"hundred"``,
    ``"twenty"``, ``"three"``). Tokens that are not purely digits pass through
    unchanged. Only the term text is handled; offset and position attributes
    are not maintained.
    """

    # Words for the values needed to spell any number built from these parts.
    num_map = {0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty", 60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand", 1000000: "million"}
    # A token made up of digits only.
    is_num = re.compile(r"^\d+$")

    def __init__(self, in_stream):
        """Drain ``in_stream``, convert its tokens, and prepare to replay them.

        NOTE(review): the upstream is consumed eagerly here rather than
        lazily in ``incrementToken``; presumably a later reset/reuse of the
        stream is not expected -- confirm against the analyzer that builds
        this filter.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Collect the term text of every upstream token.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Convert the collected tokens.
        self.tokens = self.filter(tokens)
        # Iterator that incrementToken() advances one token at a time.
        self.iter = iter(self.tokens)

    def filter(self, tokens):
        """Return ``tokens`` with every all-digit token spelled out in words.

        Leading zeros are ignored, and a token consisting only of zeros
        becomes ``"zero"``. Three-digit groups that are entirely zero
        contribute no words and no scale word, so ``"1000000"`` becomes
        ``["one", "million"]``.
        """
        num_map = self.num_map
        is_num = self.is_num
        final = []
        for token in tokens:
            if not is_num.match(token):
                final.append(token)
                continue
            # Strip leading zeros, then reverse so index 0 is the ones digit.
            digits = token.lstrip('0')[::-1]
            if not digits:
                # The token was made up of zeros only.
                final.append(num_map[0])
                continue
            # Cut the reversed digits into groups of three and reverse the
            # list of groups so they come out most-significant first. Inside
            # each group: 0 -> ones, 1 -> tens, 2 -> hundreds.
            groups = [digits[i:i + 3] for i in range(0, len(digits), 3)][::-1]
            scale = len(groups) - 1
            for group in groups:
                words = self._group_words(group)
                # An all-zero group adds nothing, not even its scale word.
                if words:
                    final.extend(words)
                    final.extend(self._scale_words(scale))
                scale -= 1
        return final

    def _group_words(self, group):
        """Spell one reversed group of up to three digits (ones digit first)."""
        num_map = self.num_map
        words = []
        size = len(group)
        if size == 3 and group[2] != '0':
            # Hundreds digit.
            words.append(num_map[int(group[2])])
            words.append(num_map[100])
        if size >= 2:
            if group[1] == '1':
                # 10-19 have their own words; group[1::-1] is tens then ones.
                words.append(num_map[int(group[1::-1])])
            else:
                if group[1] != '0':
                    # Tens digit of 2 or more: twenty, thirty, ...
                    words.append(num_map[int(group[1]) * 10])
                if group[0] != '0':
                    words.append(num_map[int(group[0])])
        elif group[0] != '0':
            # Single-digit group.
            words.append(num_map[int(group[0])])
        return words

    def _scale_words(self, scale):
        """Return the scale words for the group at index ``scale`` (0 = units).

        Scales are composed from "thousand" and "million" only: 1 gives
        "thousand", 2 gives "million", 3 gives "thousand million", and so on.
        """
        num_map = self.num_map
        words = []
        if scale % 2:
            words.append(num_map[1000])
        words.extend([num_map[1000000]] * (scale // 2))
        return words

    def incrementToken(self):
        """Load the next converted token into the term attribute.

        Returns True if a token was produced, False once they are exhausted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True