Imagine I have 4 files identified by a given list of unordered bu non-repeating tags ('A', 'B', 'C', 'D'). For example:
folder_tags_dict = {
'f1': ['A', 'B'],
'f2': ['A', 'C', 'D'],
'f3': ['D'],
'f4': ['C', 'A']
}
I would like to construct a data tree where every file is uniquely identified by the list of tags (just like a folder structure), provided by the configuration above. The solution is not unique. The following two trees are possible solutions to this problem:
Tree1:
A --- B --- f1
\--- C --- D --- f2
\--- f4
D --- f3
Tree2:
C --- A --- D --- f2
\--- A --- f4
A --- B --- f1
D --- f3
The questions is:
Is there an algorithm that takes the configuration in folder_tags_dict and optimizes the data tree generation hierarchy in order to minimize the number of total edges in that tree?
In the previous case, Tree1 is a better solution (7 edges) than Tree2 (8 edges).
I know how to code in Python, but my knowledge in data structures is limited. This could be used to find the optimal way of making a folder structure where every file is identified by a set of tags. However, when the number of files and tags are in the hundreds, an efficient algorithm is needed and brute force is not an option.
Try this:
from collections import Counter
from contextlib import suppress
from itertools import chain
from typing import Any
class Node:
def __init__(self, tag: Any, tags_dict) -> None:
self.tag = tag
self.item = None
self.children = {}
self.build_node(tags_dict)
def get_edges(self):
edges = sum([ch.get_edges() + 1 for _, ch in self.children.items()])
return edges + 1 if self.item is not None else edges
def get_children(self):
return {f"{key}-{ch.item}": ch.get_children() for key, ch in self.children.items()}
def build_node(self, tags_dict):
items = [key for key, value in tags_dict.items() if len(value) == 0]
if items:
self.item = items[0]
mult_tag = {key: value for key, value in tags_dict.items() if len(value) > 0}
while len(mult_tag.keys()) > 0:
tag = self.get_best_tag(mult_tag)
if not tag:
tag = [value[0] for key, value in mult_tag.items() if len(value) == 1][0]
contains_tag_dict = {key: value for key, value in mult_tag.items() if tag in value}
reduced_tags_dict = self.reduce_dict(contains_tag_dict, tag)
self.children[tag] = self.subtree(tag, reduced_tags_dict)
mult_tag = {key: value for key, value in mult_tag.items() if tag not in value}
@classmethod
def subtree(cls, tag, tags_dict):
return cls(tag, tags_dict)
@staticmethod
def reduce_dict(tags_dict, tag):
new_tags_dict = {}
for key, item in tags_dict.items():
if len(item) > 0:
new_tags_dict[key] = [t for t in item if t != tag]
return new_tags_dict
@staticmethod
def get_best_tag(tags_dict):
all_tag_lists = [item for _, item in tags_dict.items()]
tags_count = Counter(chain(*all_tag_lists))
with suppress(Exception):
return tags_count.most_common()[0][0]
folder_tags_dict = {
'f1': ['A', 'B'],
'f2': ['A', 'C', 'D'],
'f3': ['D'],
'f4': ['C', 'A']
}
tree = Node("root", folder_tags_dict)
print("TREE: ", tree.get_children())
print("EDGES(from root): ", tree.get_edges())