Search code examples
pythontreetrie

Count the number of words that appear before


I would like to ask how can we count the number of words that occur alphabetically before the given string in the trie?

Here is my implementation now.

class TrieNode:
    # Trie node class
    def __init__(self):
        self.children = [None] * 26
        # isEndOfWord is True if node represent the end of the word
        self.isEndOfWord = False
        self.word_count = 0
class Trie:
    # Trie data structure class
    def __init__(self):
        self.root = self.getNode()

    def getNode(self):
        # Returns new trie node (initialized to NULLs)
        return TrieNode()

    def _charToIndex(self, ch):
        # private helper function
        # Converts key current character into index
        # use only 'a' through 'z' and lower case
        return ord(ch) - ord('a')

    def insert(self, key):
        # If not present, inserts key into trie
        # If the key is prefix of trie node,
        # just marks leaf node
        pCrawl = self.root
        length = len(key)
        for level in range(length):
            index = self._charToIndex(key[level])
            # if current character is not present
            if not pCrawl.children[index]:
                pCrawl.children[index] = self.getNode()
            pCrawl = pCrawl.children[index]
            # mark last node as leaf
        pCrawl.isEndOfWord = True
        pCrawl.word_count += 1

    def search(self, key):
        # Search key in the trie
        # Returns true if key presents
        # in trie, else false
        pCrawl = self.root
        length = len(key)
        for level in range(length):
            index = self._charToIndex(key[level])
            if not pCrawl.children[index]:
                return False
            pCrawl = pCrawl.children[index]
        return pCrawl is not None and pCrawl.isEndOfWord

    def count_before(self, string):
        cur = self.root
        for b in string:
            index = self._charToIndex(b)
            print(index)
            cur = cur.children[index]
            if cur is None:
                return 0
        return cur.word_count
def total_before(text):
    t = Trie()
    for i in range(len(text)):
        t.insert(text[i])
    
    a_list = [] # A list to store the result that occur before the text[i]
    for i in range(len(text)):
        result = t.count_before(text[i])
        a_list.append(result)
    return a_list

total_before(["bac", "aaa", "baa", "aac"]) # Output will be [3, 0, 2, 1]

I would like to know how can I count the number of words that occur before the given string in the trie that I had created. Can someone give me an idea about it?


Solution

  • As word_count is currently initialised, it does not serve much purpose. It only is non-zero at nodes with isEndOfWord set to True. It would be more useful if it counted the number of words that depend on the current node, i.e. words that either end in that node (which your code counts now), or continue further down the trie (which are currently not counted).

    To make that happen, also increment word_count while you descend the trie:

        def insert(self, key):
            pCrawl = self.root
            length = len(key)
            for level in range(length):
                pCrawl.word_count += 1   # <-------------- added
                index = self._charToIndex(key[level])
                if not pCrawl.children[index]:
                    pCrawl.children[index] = self.getNode()
                pCrawl = pCrawl.children[index]
            pCrawl.isEndOfWord = True
            pCrawl.word_count += 1
    

    In count_before you would need to sum up all the word_count values of the child nodes the precede the child that you will select, as those represent words that come before the current word:

        def count_before(self, string):
            count = 0  # used to accumulate the word_counts
            cur = self.root
            for b in string:
                index = self._charToIndex(b)
                # add the word counts of the children that are to the left of this index:
                count += sum(node.word_count for node in cur.children[:index] if node)
                cur = cur.children[index]
                if cur is None:
                    break
            return count
    

    This line:

    count += sum(node.word_count for node in cur.children[:index] if node)
    

    Is a compact way of doing this:

    mysum = 0
    for node in cur.children[:index]:
        if node:
            mysum += node.word_count
    sum += mysum