I have the following t=5
DNA strings:
DNA = '''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''
k = 8
t = 5
I'm trying to find the best motifs of length k=8
from the collection of strings using a laplace transform to randomly sample chunks of length k from each of the t strings.
My helper functions are as follows:
def window(s, k):
for i in range(1 + len(s) - k):
yield s[i:i+k]
def HammingDistance(seq1, seq2):
if len(seq1) != len(seq2):
raise ValueError('Undefined for sequences of unequal length.')
return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))
def score(motifs):
score = 0
for i in range(len(motifs[0])):
motif = ''.join([motifs[j][i] for j in range(len(motifs))])
score += min([HammingDistance(motif, homogeneous*len(motif)) for homogeneous in 'ACGT'])
return score
def profile(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
return prof
def profile_most_probable_kmer(dna, k, prof):
dna = dna.splitlines()
nuc_loc = {nucleotide:index for index,nucleotide in enumerate('ACGT')}
motif_matrix = []
max_prob = [-1, None]
for i in range(len(dna)):
motif_matrix.append(max_prob)
for i in range(len(dna)):
for chunk in window(dna[i],K):
current_prob = 1
for j, nuc in enumerate(chunk):
current_prob*=prof[j][nuc_loc[nuc]]
if current_prob>motif_matrix[i][0]:
motif_matrix[i] = [current_prob,chunk]
return list(list(zip(*motif_matrix))[1])
def profile_with_pseudocounts(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
return prof
from random import randint
def SampleMotifs(Dna,k,t):
Dna = Dna.splitlines()
BestMotifs = []
for line in Dna:
position = randint(0,len(line)-k)
BestMotifs.append(line[position:position+k])
return BestMotifs
def motifs_from_profile(profile, dna, k):
return [profile_most_probable_kmer(seq,k,profile) for seq in dna]
def randomized_motif_search(dna,k,t):
from random import randint
dna = dna.splitlines()
rand_ints = [randint(0,len(dna[0])-k)) for a in range(len(dna))]
motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
best_score = [score(motifs), motifs]
while True:
current_profile = profile_with_pseudocounts(motifs)
motifs = motifs_from_profile(current_profile, dna, k)
current_score = score(motifs)
if current_score < best_score[0]:
best_score = [current_score, motifs]
else:
return best_score[1]
def Laplace(dna,k,t):
i = 0
LastMotifs = randomized_motif_search(dna,k,t)
while i < 1000:
try:
BestMotifs = randomized_motif_search(dna,k,t)
if score(BestMotifs)<score(LastMotifs):
LastMotifs = BestMotifs
except:
pass
i+=1
print(*LastMotifs)
The output I should be getting for this is :
TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG
I get different outputs every time which I expect with a method with a random element, but it should be converging when I iterate 1000 times and only update my best motifs if the score is lower. The fact I had to put in an error handler in the laplace since I get an index when I call randomized_motif_search(dna,k,t)
tells me that might be the source of the problem. I've spent the last two days scouring the code to make sure everything is the right shape, but the fact I'm getting the wrong answer or the following errors:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-811-ee735449bb8e> in <module>
----> 1 Laplace(DNA,K,T)
<ipython-input-810-31301d79eb95> in Laplace(dna, k, t)
3 LastMotifs = randomized_motif_search(dna,k,t)
4 while i < 2000:
----> 5 BestMotifs = randomized_motif_search(dna,k,t)
6 if score(BestMotifs)<score(LastMotifs):
7 LastMotifs = BestMotifs
<ipython-input-809-43600d882734> in randomized_motif_search(dna, k, t)
8 while True:
9 current_profile = profile_with_pseudocounts(motifs)
---> 10 motifs = motifs_from_profile(current_profile, dna, k)
11 current_score = score(motifs)
12 if current_score < best_score[0]:
<ipython-input-408-7c866045d839> in motifs_from_profile(profile, dna, k)
1 def motifs_from_profile(profile, dna, k):
----> 2 return [profile_most_probable_kmer(seq,k,profile) for seq in dna]
<ipython-input-408-7c866045d839> in <listcomp>(.0)
1 def motifs_from_profile(profile, dna, k):
----> 2 return [profile_most_probable_kmer(seq,k,profile) for seq in dna]
<ipython-input-795-56f83ba5ee2b> in profile_most_probable_kmer(dna, k, prof)
25 current_prob = 1
26 for j, nuc in enumerate(chunk):
---> 27 current_prob*=prof[j][nuc_loc[nuc]]
28 if current_prob>motif_matrix[i][0]:
29 motif_matrix[i] = [current_prob,chunk]
IndexError: list index out of range
It's more than a little vexing. Actual help would be greatly appreciated.
EDIT: The problem was my indexing for the random integers that were sampling random motifs from the dna strings, and that my motifs_from_profile
function returns a list of lists rather than just a list which the code kinda depends on. I have updated the functions below: While these fixes resolved the problems that were throwing up errors in the code and I'm now getting an output each time I run the Laplace
function, but the result is not what I expect even when I feed the correct answer in at the first iteration. I will try my hardest to debug what's going on in the scoring function and review the literature I guess. Maybe some more vague input from the community will help but who knows?
the updated randomized_motif_search
is:
def randomized_motif_search(dna,k,t):
from random import randint
from itertools import chain
dna = dna.splitlines()
# Randomly generate k-mers from each sequence in the dna list.
rand_ints = [randint(0,len(dna[0])-(k)) for a in range(len(dna))]
motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
best_score = [score(motifs), motifs]
while True:
current_profile = profile_with_pseudocounts(motifs)
mfp = motifs_from_profile(current_profile,dna,k)
motifs = []
for i in range(len(mfp)):
motifs.append(mfp[i][0])
current_score = score(motifs)
if current_score < best_score[0]:
best_score = [current_score, motifs]
else:
return best_score[1]
and the new Laplace
:
def Laplace(dna,k,t):
i = 0
LastMotifs = randomized_motif_search(dna,k,t)
while i < 1000:
BestMotifs = randomized_motif_search(dna,k,t)
if score(BestMotifs)<score(LastMotifs):
LastMotifs = BestMotifs
i+=1
print(*LastMotifs)
EDIT: Dear journal,
I've messed around with the score method and figured out how to return a motif score, but still am not arriving at the correct answers for the question. I am officially stuck here is the updated score
function code with proper indexing:
def score(motifs):
score = 0
for i in range(len(motifs[0])):
motif = ''.join([Motifs[j][i] for j in range(len(Motifs))])
score+=min([HammingDistance(motif,homogenous*len(motif)) for homogenous in 'ACGT'])
return score
I figured it out! It was all indexing that originated from the call to splitlines()
that I had in the beginning of a lot of my helper functions. I also hadn't fully debugged my score()
function, so my scoring wasn't happening the way the literature had asked me to do it.
all the helper functions are as follows:
def window(s, k):
for i in range(1 + len(s) - k):
yield s[i:i+k]
def HammingDistance(seq1, seq2):
if len(seq1) != len(seq2):
raise ValueError('Undefined for sequences of unequal length.')
return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))
def score(motifs):
score = 0
for i in range(len(motifs[0])):
motif = ''.join(motifs[j][i] for j in range(len(motifs)))
score += min([HammingDistance(motif,homogenous*len(motif)) for homogenous in 'ACGT'])
return score
def profile(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
return prof
def profile_most_probable_kmer(dna, k, prof):
nuc_loc = {nucleotide:index for index,nucleotide in enumerate('ACGT')}
motif_matrix = []
max_prob = [-1, None]
for i in range(len(dna)):
motif_matrix.append(max_prob)
for i in range(len(dna)):
for chunk in window(dna[i],k):
current_prob = 1
for j, nuc in enumerate(chunk):
current_prob*=prof[j][nuc_loc[nuc]]
if current_prob>motif_matrix[i][0]:
motif_matrix[i] = [current_prob,chunk]
return list(list(zip(*motif_matrix))[1])
def profile_with_pseudocounts(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
return prof
from random import randint
def SampleMotifs(Dna,k,t):
Dna = Dna.splitlines()
BestMotifs = []
for line in Dna:
position = randint(0,len(line)-k)
BestMotifs.append(line[position:position+k])
return BestMotifs
def randomized_motif_search(dna,k,t):
from random import randint
from itertools import chain
dna = dna.splitlines()
# Randomly generate k-mers from each sequence in the dna list.
rand_ints = [randint(0,len(dna[0])-(k)) for a in range(len(dna))]
motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
best_score = [score(motifs), motifs]
while True:
current_profile = profile_with_pseudocounts(motifs)
motifs = profile_most_probable_kmer(dna,k,current_profile)
# motifs = []
# for i in range(len(mfp)):
# motifs.append(mfp[i][0])
current_score = score(motifs)
if current_score < best_score[0]:
best_score = [current_score, motifs]
else:
return best_score[1]
def Laplace(dna,k,t):
import random
random.seed(0)
i = 0
LastMotifs = randomized_motif_search(dna,k,t)
while i < 1000:
BestMotifs = randomized_motif_search(dna,k,t)
if score(BestMotifs)<score(LastMotifs):
LastMotifs = BestMotifs
i+=1
print('\n'.join(LastMotifs))