Search code examples
pythonsearchrandombioinformaticsdna-sequence

Unexpected output in randomized motif search in DNA strings


I have the following t=5 DNA strings:

DNA = '''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''
k = 8
t = 5

I'm trying to find the best motifs of length k=8 from the collection of strings using a laplace transform to randomly sample chunks of length k from each of the t strings.

My helper functions are as follows:

def window(s, k):
    for i in range(1 + len(s) - k):
        yield s[i:i+k]

def HammingDistance(seq1, seq2):
    if len(seq1) != len(seq2):
        raise ValueError('Undefined for sequences of unequal length.')
    return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))

def score(motifs):
    score = 0
    for i in range(len(motifs[0])):
        motif = ''.join([motifs[j][i] for j in range(len(motifs))])
        score += min([HammingDistance(motif, homogeneous*len(motif)) for homogeneous in 'ACGT'])
    return score

def profile(motifs):
    prof = []
    for i in range(len(motifs[0])):
        col = ''.join([motifs[j][i] for j in range(len(motifs))])
        prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
    return prof

def profile_most_probable_kmer(dna, k, prof):
    dna = dna.splitlines()
    nuc_loc = {nucleotide:index for index,nucleotide in enumerate('ACGT')}
    motif_matrix = []
    max_prob = [-1, None]
    for i in range(len(dna)):
        motif_matrix.append(max_prob)
    for i in range(len(dna)):
        for chunk in window(dna[i],K):
            current_prob = 1
            for j, nuc in enumerate(chunk):
                current_prob*=prof[j][nuc_loc[nuc]]
            if current_prob>motif_matrix[i][0]:
                motif_matrix[i] = [current_prob,chunk]
    return list(list(zip(*motif_matrix))[1])

def profile_with_pseudocounts(motifs):
    prof = []
    for i in range(len(motifs[0])):
        col = ''.join([motifs[j][i] for j in range(len(motifs))])
        prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
    return prof

from random import randint

def SampleMotifs(Dna,k,t):
    Dna = Dna.splitlines()
    BestMotifs = []
    for line in Dna:
        position = randint(0,len(line)-k)
        BestMotifs.append(line[position:position+k])
    return BestMotifs

def motifs_from_profile(profile, dna, k):
    return [profile_most_probable_kmer(seq,k,profile) for seq in dna]


def randomized_motif_search(dna,k,t):
    from random import randint
    dna = dna.splitlines()
    rand_ints = [randint(0,len(dna[0])-k)) for a in range(len(dna))]
    motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
    best_score = [score(motifs), motifs]
    while True:
        current_profile = profile_with_pseudocounts(motifs)
        motifs = motifs_from_profile(current_profile, dna, k)
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]
        else:
            return best_score[1]


def Laplace(dna,k,t):
    i = 0
    LastMotifs = randomized_motif_search(dna,k,t)
    while i < 1000:
        try:
            BestMotifs = randomized_motif_search(dna,k,t)
            if score(BestMotifs)<score(LastMotifs):
                LastMotifs = BestMotifs
        except:
            pass
        i+=1
    print(*LastMotifs)

The output I should be getting for this is :

TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG

I get different outputs every time which I expect with a method with a random element, but it should be converging when I iterate 1000 times and only update my best motifs if the score is lower. The fact I had to put in an error handler in the laplace since I get an index when I call randomized_motif_search(dna,k,t) tells me that might be the source of the problem. I've spent the last two days scouring the code to make sure everything is the right shape, but the fact I'm getting the wrong answer or the following errors:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-811-ee735449bb8e> in <module>
----> 1 Laplace(DNA,K,T)

<ipython-input-810-31301d79eb95> in Laplace(dna, k, t)
      3     LastMotifs = randomized_motif_search(dna,k,t)
      4     while i < 2000:
----> 5         BestMotifs = randomized_motif_search(dna,k,t)
      6         if score(BestMotifs)<score(LastMotifs):
      7             LastMotifs = BestMotifs

<ipython-input-809-43600d882734> in randomized_motif_search(dna, k, t)
      8     while True:
      9         current_profile = profile_with_pseudocounts(motifs)
---> 10         motifs = motifs_from_profile(current_profile, dna, k)
     11         current_score = score(motifs)
     12         if current_score < best_score[0]:

<ipython-input-408-7c866045d839> in motifs_from_profile(profile, dna, k)
      1 def motifs_from_profile(profile, dna, k):
----> 2     return [profile_most_probable_kmer(seq,k,profile) for seq in dna]

<ipython-input-408-7c866045d839> in <listcomp>(.0)
      1 def motifs_from_profile(profile, dna, k):
----> 2     return [profile_most_probable_kmer(seq,k,profile) for seq in dna]

<ipython-input-795-56f83ba5ee2b> in profile_most_probable_kmer(dna, k, prof)
     25             current_prob = 1
     26             for j, nuc in enumerate(chunk):
---> 27                 current_prob*=prof[j][nuc_loc[nuc]]
     28             if current_prob>motif_matrix[i][0]:
     29                 motif_matrix[i] = [current_prob,chunk]

IndexError: list index out of range

It's more than a little vexing. Actual help would be greatly appreciated.

EDIT: The problem was my indexing for the random integers that were sampling random motifs from the dna strings, and that my motifs_from_profile function returns a list of lists rather than just a list which the code kinda depends on. I have updated the functions below: While these fixes resolved the problems that were throwing up errors in the code and I'm now getting an output each time I run the Laplace function, but the result is not what I expect even when I feed the correct answer in at the first iteration. I will try my hardest to debug what's going on in the scoring function and review the literature I guess. Maybe some more vague input from the community will help but who knows?

the updated randomized_motif_search is:

def randomized_motif_search(dna,k,t):
    from random import randint
    from itertools import chain
    dna = dna.splitlines()
#     Randomly generate k-mers from each sequence in the dna list.
    rand_ints = [randint(0,len(dna[0])-(k)) for a in range(len(dna))]
    motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
    best_score = [score(motifs), motifs]
    while True:
        current_profile = profile_with_pseudocounts(motifs)
        mfp = motifs_from_profile(current_profile,dna,k)
        motifs = []
        for i in range(len(mfp)):
            motifs.append(mfp[i][0])
        current_score = score(motifs)
        if current_score < best_score[0]:
            best_score = [current_score, motifs]
        else:
            return best_score[1]

and the new Laplace:

def Laplace(dna,k,t):
    i = 0

    LastMotifs = randomized_motif_search(dna,k,t)
    while i < 1000:
        BestMotifs = randomized_motif_search(dna,k,t)
        if score(BestMotifs)<score(LastMotifs):
            LastMotifs = BestMotifs
        i+=1
    print(*LastMotifs)

EDIT: Dear journal, I've messed around with the score method and figured out how to return a motif score, but still am not arriving at the correct answers for the question. I am officially stuck here is the updated score function code with proper indexing:

def score(motifs):
    score = 0
    for i in range(len(motifs[0])):
        motif = ''.join([Motifs[j][i] for j in range(len(Motifs))])
        score+=min([HammingDistance(motif,homogenous*len(motif)) for homogenous in 'ACGT'])
    return score

Solution

  • I figured it out! It was all indexing that originated from the call to splitlines() that I had in the beginning of a lot of my helper functions. I also hadn't fully debugged my score() function, so my scoring wasn't happening the way the literature had asked me to do it.

    all the helper functions are as follows:

    def window(s, k):
        for i in range(1 + len(s) - k):
            yield s[i:i+k]
    
    def HammingDistance(seq1, seq2):
        if len(seq1) != len(seq2):
            raise ValueError('Undefined for sequences of unequal length.')
        return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))
    
    def score(motifs):
        score = 0
        for i in range(len(motifs[0])):
            motif = ''.join(motifs[j][i] for j in range(len(motifs)))
            score += min([HammingDistance(motif,homogenous*len(motif)) for homogenous in 'ACGT'])
        return score
    
    def profile(motifs):
        prof = []
        for i in range(len(motifs[0])):
            col = ''.join([motifs[j][i] for j in range(len(motifs))])
            prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
        return prof
    
    def profile_most_probable_kmer(dna, k, prof):
        nuc_loc = {nucleotide:index for index,nucleotide in enumerate('ACGT')}
        motif_matrix = []
        max_prob = [-1, None]
        for i in range(len(dna)):
            motif_matrix.append(max_prob)
        for i in range(len(dna)):
            for chunk in window(dna[i],k):
                current_prob = 1
                for j, nuc in enumerate(chunk):
                    current_prob*=prof[j][nuc_loc[nuc]]
                if current_prob>motif_matrix[i][0]:
                    motif_matrix[i] = [current_prob,chunk]
        return list(list(zip(*motif_matrix))[1])
    
    def profile_with_pseudocounts(motifs):
        prof = []
        for i in range(len(motifs[0])):
            col = ''.join([motifs[j][i] for j in range(len(motifs))])
            prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
        return prof
    
    from random import randint
    
    def SampleMotifs(Dna,k,t):
        Dna = Dna.splitlines()
        BestMotifs = []
        for line in Dna:
            position = randint(0,len(line)-k)
            BestMotifs.append(line[position:position+k])
        return BestMotifs
    
    def randomized_motif_search(dna,k,t):
        from random import randint
        from itertools import chain
        dna = dna.splitlines()
    #     Randomly generate k-mers from each sequence in the dna list.
        rand_ints = [randint(0,len(dna[0])-(k)) for a in range(len(dna))]
        motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
        best_score = [score(motifs), motifs]
        while True:
            current_profile = profile_with_pseudocounts(motifs)
            motifs = profile_most_probable_kmer(dna,k,current_profile)
    #         motifs = []
    #         for i in range(len(mfp)):
    #             motifs.append(mfp[i][0])
            current_score = score(motifs)
            if current_score < best_score[0]:
                best_score = [current_score, motifs]
            else:
                return best_score[1]
    
    def Laplace(dna,k,t):
        import random
        random.seed(0)
        i = 0
    
        LastMotifs = randomized_motif_search(dna,k,t)
        while i < 1000:
            BestMotifs = randomized_motif_search(dna,k,t)
            if score(BestMotifs)<score(LastMotifs):
                LastMotifs = BestMotifs
            i+=1
        print('\n'.join(LastMotifs))