Search code examples
pythonmatrixalignmentbioinformatics

Smith-Waterman algorithm to generate matrix in Python


I'm using Python to generate a dynamic programming matrix using the Smith-Waterman algorithm.

Here's what I have so far:

def score(base1,base2):
    base1=base1.upper()
    base2=base2.upper()
    if base1 not in 'ACTG' or base2 not in 'ACTG':
        print 'Not DNA base!'
        sys.exit()
    elif base1==base2:
        return 3
    elif base1+base2=='AG' or base1+base2=='GA':
        return -1
    elif base1+base2=='CT' or base1+base2=='TC':
        return -1
    else:
        return -2
import sys

seq1 = sys.argv[1]
seq2 = sys.argv[2]
mRows = len(seq1)
nCols = len(seq2)
gap = int(sys.argv[3])
matrix = []

# generate empty matrix
for x in range(mRows + 1):
    matrix.append([])
    for y in range(nCols + 1):
        matrix[x].append(0)

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1):
        dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])
        vscore = matrix[i-1][j] + gap
        hscore = matrix[i][j-1] + gap
        matrix[i][j]=max(0, vscore, hscore, dscore)

With input: sw.py ATGCAT ACCT -1

I get the matrix output:

0       0       0       0       0
0       0       0       0       0
0       0       0       0       3
0       0       0       0       2
0       0       0       0       1
0       0       0       0       0
0       0       0       0       3

With some troubleshooting I was able to see that in the nested for loop, only the scores using the final value of j (for this particular input, 4) are stored in the matrix, i.e. just the last column.

My question is why is this happening and how can I get around it? Why does the for loop jump back and not continue to the variable scores?

Some of my troubleshooting:

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1):
        print 'this is i', i
        print 'this is j', j
        print 'seq1', seq1[i-1], 'seq2', seq2[j-1]
        dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])
        vscore = matrix[i-1][j] + gap
        hscore = matrix[i][j-1] + gap
        matrix[i][j]=max(0, vscore, hscore, dscore)
        print 'Vscore = ', vscore
        print 'Hscore = ', hscore
        print 'Dscore = ', dscore
        print '\n'

gives:

this is i 1
this is j 1
seq1 A seq2 A
this is i 1
this is j 2
seq1 A seq2 C
this is i 1
this is j 3
seq1 A seq2 C
this is i 1
this is j 4
seq1 A seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  -2


this is i 2
this is j 1
seq1 T seq2 A
this is i 2
this is j 2
seq1 T seq2 C
this is i 2
this is j 3
seq1 T seq2 C
this is i 2
this is j 4
seq1 T seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  3


this is i 3
this is j 1
seq1 G seq2 A
this is i 3
this is j 2
seq1 G seq2 C
this is i 3
this is j 3
seq1 G seq2 C
this is i 3
this is j 4
seq1 G seq2 T
Vscore =  2
Hscore =  -1
Dscore =  -2


this is i 4
this is j 1
seq1 C seq2 A
this is i 4
this is j 2
seq1 C seq2 C
this is i 4
this is j 3
seq1 C seq2 C
this is i 4
this is j 4
seq1 C seq2 T
Vscore =  1
Hscore =  -1
Dscore =  -1


this is i 5
this is j 1
seq1 A seq2 A
this is i 5
this is j 2
seq1 A seq2 C
this is i 5
this is j 3
seq1 A seq2 C
this is i 5
this is j 4
seq1 A seq2 T
Vscore =  0
Hscore =  -1
Dscore =  -2


this is i 6
this is j 1
seq1 T seq2 A
this is i 6
this is j 2
seq1 T seq2 C
this is i 6
this is j 3
seq1 T seq2 C
this is i 6
this is j 4
seq1 T seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  3

Thanks!


Solution

  • I wrote a class for the Smith-Waterman algorithm, which you may find useful. I would recommend using numpy arrays for this task as they are generally more efficient than lists for larger and larger sequences.

    class LocalAligner(object):
    #Constructor
    def __init__(self, match=None, mismatch=None, gap=None, fileP=None, fileQ=None):
        #Default args
        #StringQ = GCTGGAAGGCAT
        #StringP = GCAGAGCACG
        #Match Score = +5, Mismatch Score = -4, Gap Penalty = -4
        if match is None and mismatch is None and gap is None and fileP is None and fileQ is None:
            #string1
            self.q = "GCTGGAAGGCAT"
            self.stringQName = "Sequence Q"
    
            #string2
            self.p = "GCAGAGCACG"
            self.stringPName = "Sequence P"
    
            #Scoring parameter
            self.gapPen = -4
            self.mismatchPen = -4
            self.matchScore = 5
    
        #User has given sequences and scoring arguments to the object
        elif match is not None and mismatch is not None and gap is not None and fileP is not None and fileQ is not None:
            #Default string name if one is not present in the file
            self.stringQName = "String Q"
            self.q = self.parseFile(fileQ, 1)
    
            #Default string name if one is not present in the file
            self.stringQName = "String P"
            self.p = self.parseFile(fileP, 2)
    
            #Scoring parameters given at the command line
            self.gapPen = int(gap)
            self.mismatchPen = int(mismatch)
            self.matchScore = int(match)
    
    
        #Final sequence alignments
        self.finalQ = ""
        self.finalP = ""
    
        #Create a table and initialize to zero
        #We will use numpy arrays as they are generally more efficient than lists for large amounts of data (ie sequences)
        self.MatrixA = np.empty(shape=[len(self.p)+1,len(self.q)+1])
    
        #Create b table
        self.MatrixB = np.empty(shape=[len(self.p)+1,len(self.q)+1])
    
        #Store max score and location
        self.maxScore = 0
        self.maxI = None
        self.maxJ =None
    
    #Populates the A and B tables
    #A table holds the scores and the B table holds the direction of the optimal solution for each sub problem
    def calcTables(self):
        #insert initial blank string 1
        try:
            self.q = '-' + self.q
        except IOError:
            print("Error with sequence 1")
    
        #insert initial blank string 2
        try:
            self.p = '-' + self.p
        except IOError:
            print("Error with sequence 2")
    
        #Initialize row and column 0 for A and B tables
        self.MatrixA[:,0] = 0
        self.MatrixA[0,:] = 0
        self.MatrixB[:,0] = 0
        self.MatrixB[0,:] = 0
    
        for i in range(1,len(self.p)):
            for j in range(1, len(self.q)):
    
                #Look for match
                if self.p[i] == self.q[j]:
                    #Match found
                    self.MatrixA[i][j] = self.MatrixA[i-1][j-1] + self.matchScore
                    #3 == "diagonal" for traversing solution
                    self.MatrixB[i][j] = 3
    
                    #Check for max score
                    if self.MatrixA[i][j] > self.maxScore:
                        self.maxScore = self.MatrixA[i][j]
                        self.maxI = i
                        self.maxJ = j
    
                #Match not found
                else:
                    self.MatrixA[i][j] = self.findMaxScore(i,j)
    
    
    #Finds the maximum score either in the north or west neighbor in the A table
    #Due to the ordering, gaps are checked first
    def findMaxScore(self, i, j):
    
        #North score
        qDelet = self.MatrixA[i-1][j] + self.gapPen
        #West score
        pDelet = self.MatrixA[i][j-1] + self.gapPen
        #Diagonal Score
        mismatch = self.MatrixA[i-1][j-1] + self.mismatchPen
    
        #Determine the max score
        maxScore = max(qDelet, pDelet, mismatch)
    
        #Set B table
        if qDelet == maxScore:
            self.MatrixB[i][j] = 2 #2 == "up" for traversing solution
    
        elif pDelet == maxScore:
            self.MatrixB[i][j] = 1 #1 == "left" for traversing solution
    
        elif mismatch == maxScore:
            self.MatrixB[i][j] = 3 #3 == "diagonal" for traversing solution
    
        return maxScore
    
    #Calculate the alignment with the highest score by tracing back the highest scoring local solution
    #Integers:
    #3 -> "DIAGONAL" -> match
    #2 -> "UP" -> gap in string q
    #1 -> "LEFT" -> gap in string p
    #were used in the B table
    def calcAlignemnt(self, i=None, j=None):
    
        #Default arguments to the maximum score in the A table
        if i is None and j is None:
            i = self.maxI
            j = self.maxJ
    
        #Base case, end of the local alignment
        if i == 0 or j == 0:
            return
    
        #Optimal solution "DIAGONAL"
        if self.MatrixB[i][j] == 3:
            self.calcAlignemnt(i-1 , j-1)
            self.finalQ += self.q[j]
            self.finalP += self.p[i]
    
        else:
            #Optimal solution "UP"
            if self.MatrixB[i][j] == 2:
                self.calcAlignemnt(i-1, j)
                self.finalQ += '-'
                self.finalP += self.p[i]
    
            else:
                #Optimal solution "LEFT"
                self.calcAlignemnt(i, j-1)
                self.finalP += '-'
                self.finalQ += self.p[j]
    
    #Parse the input sequence file for string
    #Assumes fasta format
    def parseFile(self, filePath, stringNumber):
        #Empty sequence
        seq = ""
        #Parse the file
        with open(filePath) as f:
            for line in f:
                #Remove new line characters
                line = line.replace('\r',"")   #Windows
                line = line.replace('\n', "")
    
                #Header encountered
                if line.startswith(">"):
                    if stringNumber == 2:
                        self.stringQName = line.replace('>',"")
                        continue
                    elif stringNumber == 1:
                        self.stringPName = line.replace('>',"")
                        continue
                    else:
                        continue
    
                #Append line
                seq += line
            f.close()
        return seq