I have a huge trie dictionary that I built from data from web. Although it is just 5MB when I write the trie into a file its' size is so big when I load it on the memory (more than 100 MB). So I've to compress the trie.
I am facing difficulties in writing a recursive function (preferably runs in linear time like a DFS) to remove the words whose frequency is < 5 and length > 15. Any help is appreciated
Here is my trie structure.
class TrieNode:
def __init__(self):
self.ch = '|'
self.score = 0
self.childs = [None]*26
self.isWord = False
class Trie:
def __init__(self):
self.root = TrieNode('$')
@staticmethod
def print_trie(node, level):
if node is None:
return
print(node.ch, " ", level, " ", node.isWord)
for i in range(26):
Trie.print_trie(node.childs[i], level+1)
def insert(self, word):
word = word.lower()
if not is_valid(word):
return
childs = self.root.childs
i = 0
while i < len(word):
idx = to_int(word[i])
if childs[idx] is not None:
t = childs[idx]
else:
t = TrieNode(word[i])
childs[idx] = t
childs = t.childs
if i == len(word)-1:
t.isWord = True
t.score += 1
i += 1
def search_node(self, word):
word = word.lower()
if not is_valid(word):
return False, 0
if self.root is None or word is None or len(word) == 0:
return False, 0
children = self.root.childs
for i in range(len(word)):
idx = to_int(word[i])
if children[idx] is not None:
t = children[idx]
children = t.childs
else:
return False, 0
if t.isWord:
return True, t.score
else:
return False, t.score
The following method takes a node and its level (initially pass in root and 0) and returns True if the node should remain alive after pruning and False if the node should be removed from the trie (with its subtrie).
def prune(node, level):
if node is None:
return False
canPruneNode = True
for idx in xrange(len(node.children)):
# If any of the children remains alive, don't prune current node.
if prune(children[idx], level + 1):
canPruneNode = False
else:
# Remove dead child.
node.children[idx] = None
if node.isWord and level > 15 and node.score < 5:
node.isWord = False
# Current node should be removed if and only if all of its children
# were removed and it doesn't represent a word itself after pruning.
return node.isWord or not canPruneNode