python string substring matching suffix-tree

Finding all common, non-overlapping substrings

Given two strings, I would like to identify all common sub-strings from longest to shortest.

I want to remove any "sub-"sub-strings. As an example, any substrings of '1234' would not be included in the match between '12345' and '51234'.

string1 = '51234'

string2 = '12345'

result = ['1234', '5']

I was thinking of finding the longest common substring, then recursively finding the longest substring(s) to the left/right. However, I do not want to remove a common substring after found. For example, the result below shares a 6 in the middle:

string1 = '12345623456'

string2 = '623456'

result = ['623456', '23456']

Lastly, I need to check one string against a fixed list of thousands of strings. I am unsure if there is a smart step I could take in hashing out all the substrings in these strings.

Previous Answers:

In this thread, a dynamic programming solution is found that takes O(nm) time, where n and m are the lengths of the strings. I am interested in a more efficient approach, which would use suffix trees.

Background:

I am composing song melodies from snippets of melodies. Sometimes, a combination manages to generate a melody matching too many notes in a row of an existing one.

I can use a string similarity measure, such as Edit Distance, but believe that tunes with very small differences to melodies are unique and interesting. Unfortunately, these tunes would have similar levels of similarity to songs that copy many notes of a melody in a row.

Solution

Let's start with the Tree

from collections import defaultdict
def identity(x):
    return x


class TreeReprMixin(object):
    def __repr__(self):
        base = dict(self)
        return repr(base)


class PrefixTree(TreeReprMixin, defaultdict):
    '''
    A hash-based Prefix or Suffix Tree for testing for
    sequence inclusion. This implementation works for any
    slice-able sequence of hashable objects, not just strings.
    '''
    def __init__(self):
        defaultdict.__init__(self, PrefixTree)
        self.labels = set()

    def add(self, sequence, label=None):
        layer = self
        if label is None:
            label = sequence
        if label:
            layer.labels.add(label)
        for i in range(len(sequence)):
            layer = layer[sequence[i]]
            if label:
                layer.labels.add(label)

        return self

    def add_ngram(self, sequence, label=None):
        if label is None:
            label = sequence
        for i in range(1, len(sequence) + 1):
            self.add(sequence[:i], label)

    def __contains__(self, sequence):
        layer = self
        j = 0
        for i in sequence:
            j += 1
            if not dict.__contains__(layer, i):
                break
            layer = layer[i]
        return len(sequence) == j

    def depth_in(self, sequence):
        layer = self
        count = 0
        for i in sequence:
            if not dict.__contains__(layer, i):
                print "Breaking"
                break
            else:
                layer = layer[i]
            count += 1
        return count

    def subsequences_of(self, sequence):
        layer = self
        for i in sequence:
            layer = layer[i]
        return layer.labels

    def __iter__(self):
        return iter(self.labels)


class SuffixTree(PrefixTree):
    '''
    A hash-based Prefix or Suffix Tree for testing for
    sequence inclusion. This implementation works for any
    slice-able sequence of hashable objects, not just strings.
    '''
    def __init__(self):
        defaultdict.__init__(self, SuffixTree)
        self.labels = set()

    def add_ngram(self, sequence, label=None):
        if label is None:
            label = sequence
        for i in range(len(sequence)):
            self.add(sequence[i:], label=label)

To populate the tree, you'd use the .add_ngram method.

The next part is a little trickier since you're looking for a concurrent traversal of strings whilst keeping track of tree coordinates. To pull all this off, we need some functions which operate on the tree and a query string

def overlapping_substrings(string, tree, solved=None):
    if solved is None:
        solved = PrefixTree()
    i = 1
    last = 0
    matching = True
    solutions = []
    while i < len(string) + 1:
        if string[last:i] in tree:
            if not matching:
                matching = True
            else:
                i += 1
                continue
        else:
            if matching:
                matching = False
                solutions.append(string[last:i - 1])
                last = i - 1
                i -= 1
        i += 1
    if matching:
        solutions.append(string[last:i])
    for solution in solutions:
        if solution in solved:
            continue
        else:
            solved.add_ngram(solution)
            yield solution

def slide_start(string):
    for i in range(len(string)):
        yield string[i:]

def seek_subtree(tree, sequence):
    # Find the node of the search tree which
    # is found by this sequence of items
    node = tree
    for i in sequence:
        if i in node:
            node = node[i]
        else:
            raise KeyError(i)
    return node

def find_all_common_spans(string, tree):
    # We can keep track of solutions to avoid duplicates
    # and incomplete prefixes using a Prefix Tree
    seen = PrefixTree()
    for substring in slide_start(string):
        # Drive generator forward
        list(overlapping_substrings(substring, tree, seen))
    # Some substrings are suffixes of other substrings which you do not
    # want
    compress = SuffixTree()
    for solution in sorted(seen.labels, key=len, reverse=True):
        # A substrings may be a suffix of another substrings, but that substrings
        # is actually a repeating pattern. If a solution is
        # a repeating pattern, `not solution in seek_subtree(tree, solution)` will tell us.
        # Otherwise, discard the solution
        if solution in compress and not solution in seek_subtree(tree, solution):
            continue
        else:
            compress.add_ngram(solution)
    return compress.labels

def search(query, corpus):
    tree = SuffixTree()
    if isinstance(corpus, SuffixTree):
        tree = corpus
    else:
        for elem in corpus:
            tree.add_ngram(elem)
    return list(find_all_common_spans(query, tree))

So now to do the thing you wanted, do this:

search("12345", ["51234"])
search("623456", ["12345623456"])

If something is unclear, please let me know, and I'll try to clarify.