Search code examples
pythonalgorithmstring-matchingknuth-morris-pratt

Implementing Knuth-Morris-Pratt (KMP) algorithm for string matching with Python


I am following Cormen Leiserson Rivest Stein (clrs) book and came across "kmp algorithm" for string matching. I implemented it using Python (as-is).

However, it doesn't seem to work for some reason. where is my fault?

The code is given below:

def kmp_matcher(t,p):
    n=len(t)
    m=len(p)
    # pi=[0]*n;
    pi = compute_prefix_function(p)
    q=-1
    for i in range(n):
        while(q>0 and p[q]!=t[i]):
            q=pi[q]
        if(p[q]==t[i]):
            q=q+1
        if(q==m):
            print "pattern occurs with shift "+str(i-m)
            q=pi[q]


def compute_prefix_function(p):
    m=len(p)
    pi =range(m)
    pi[1]=0
    k=0
    for q in range(2,m):
        while(k>0 and p[k]!=p[q]):
            k=pi[k]
        if(p[k]==p[q]):
            k=k+1
        pi[q]=k
    return pi

t = 'brownfoxlazydog'
p = 'lazy'
kmp_matcher(t,p)

Solution

  • This is a class I wrote based on CLRs KMP algorithm, which contains what you are after. Note that only DNA "characters" are accepted here.

    class KmpMatcher(object):
    def __init__(self, pattern, string, stringName):
        self.motif = pattern.upper()
        self.seq = string.upper()
        self.header = stringName
        self.prefix = []
        self.validBases = ['A', 'T', 'G', 'C', 'N']
    
    #Matches the motif pattern against itself.
    def computePrefix(self):
        #Initialize prefix array
        self.fillPrefixList()
        k = 0
    
        for pos in range(1, len(self.motif)):
            #Check valid nt
            if(self.motif[pos] not in self.validBases):
                self.invalidMotif()
    
            #Unique base in motif
            while(k > 0 and self.motif[k] != self.motif[pos]):
                k = self.prefix[k]
            #repeat in motif
            if(self.motif[k] == self.motif[pos]):
                k += 1
    
            self.prefix[pos] = k
    
    #Initialize the prefix list and set first element to 0
    def fillPrefixList(self):
        self.prefix = [None] * len(self.motif)
        self.prefix[0] = 0
    
    #An implementation of the Knuth-Morris-Pratt algorithm for linear time string matching
    def kmpSearch(self):
        #Compute prefix array
        self.computePrefix()
        #Number of characters matched
        match = 0
        found = False
    
        for pos in range(0, len(self.seq)):
            #Check valid nt
            if(self.seq[pos] not in self.validBases):
                self.invalidSequence()
    
            #Next character is not a match
            while(match > 0 and self.motif[match] != self.seq[pos]):
                match = self.prefix[match-1]
            #A character match has been found
            if(self.motif[match] == self.seq[pos]):
                match += 1
            #Motif found
            if(match == len(self.motif)):
                print(self.header)
                print("Match found at position: " + str(pos-match+2) + ':' + str(pos+1))
                found = True
                match = self.prefix[match-1]
    
        if(found == False):
            print("Sorry '" + self.motif + "'" + " was not found in " + str(self.header))
    
    #An invalid character in the motif message to the user
    def invalidMotif(self):
        print("Error: motif contains invalid DNA nucleotides")
        exit()
    
    #An invalid character in the sequence message to the user
    def invalidSequence(self):
        print("Error: " + str(self.header) + "sequence contains invalid DNA nucleotides")
        exit()