Search code examples
pythonpermutationbioinformaticsdna-sequence

Generate all Permutations with at most d Mismatches


I was solving a problem for pattern matching with hamming distance up to d for a DNA sequence. Regex saved me there. But now I've encountered a different problem. Given a long DNA sequence, I've to find most frequent mismatched k-mer with at most d mismatches. Here, k-mer refers to sub-sequence of length k.

Note: DNA sequence can be represented by only using four letters: {A, C, G, T}

For example,

DNA sequence= "AGGC"
k = 3
d = 1

Here, only two k-mers are possible: "AGG", "GGC"

Now, I can permute them individually with 1 mismatches by running following code for "AGG" and "GGC"

def permute_one_nucleotide(motif, alphabet={"A", "C", "G", "T"}):
    import itertools

    return list(
        set(
            itertools.chain.from_iterable(
                [
                    [
                        motif[:pos] + nucleotide + motif[pos + 1 :]
                        for nucleotide in alphabet
                    ]
                    for pos in range(len(motif))
                ]
            )
        )
    )

"AGG" will give:

['TGG', 'ATG', 'AGG', 'GGG', 'AGT', 'CGG', 'AGC', 'AGA', 'ACG', 'AAG']

And, "GCC" will give:

['GCC', 'GAC', 'GGT', 'GGA', 'AGC', 'GTC', 'TGC', 'CGC', 'GGG', 'GGC']

Then I can use Counter to find most frequent k-mer. But, this only works if d = 1. How to generalize this for any d <= k?


Solution

  • This is computationally expensive method. But yeah should fetch desired. What I did here is calculate all mismatch with hamming dist 1. then compute new mismatch with ham dist 1 from prev mismatch and recurse till d.

    import itertools
    
    all_c=set('AGCT')
    other = lambda x : list(all_c.difference(x))
    
    def get_changed(sub, i):
        return [sub[0:i]+c+sub[i+1:] for c in other(sub[i])]
    
    def get_mismatch(d, setOfMmatch):
        
        if d==0:
            return setOfMmatch
        
        newMmatches=[]
        for sub in setOfMmatch:
            newMmatches.extend(list(map(lambda x : ''.join(x), itertools.chain.from_iterable(([get_changed(sub, i)  for i, c in enumerate(sub)])))))
        
        setOfMmatch=setOfMmatch.union(newMmatches)
        
        return get_mismatch(d-1, setOfMmatch)
    
    dna='AGGC'
    hamm_dist=1
    length=3
    
    list(itertools.chain.from_iterable([get_mismatch(hamm_dist, {dna[i:i+length]}) for i in range(len(dna)-length+1)]))
    # without duplicates
    # set(itertools.chain.from_iterable([get_mismatch(hamm_dist, {dna[i:i+length]}) for i in range(len(dna)-length+1)]))
    

    found a better performance code almost 10-20X faster

    %%time
    
    import itertools, random
    from cacheout import Cache
    import time
    
    all_c=set('AGCT')
    get_other = lambda x : list(all_c.difference(x))
    
    other={}
    for c in all_c:
        other[c]=get_other(c) 
    
    
    def get_changed(sub, i):
        return [sub[0:i]+c+sub[i+1:] for c in other[sub[i]]]
    
    mmatchHash=Cache(maxsize=256*256, ttl=0, timer=time.time, default=None)
    
    def get_mismatch(d, setOfMmatch):
        
        if d==0:
            
            return setOfMmatch
        
        newMmatches=[]
        for sub in setOfMmatch:
            newMmatches.extend(list(map(lambda x : ''.join(x), itertools.chain.from_iterable(([get_changed(sub, i)  for i, c in enumerate(sub)])))))
        
        setOfMmatch=setOfMmatch.union(newMmatches)
        
        if not mmatchHash.get((d-1, str(setOfMmatch)), 0):
            mmatchHash.set((d-1, str(setOfMmatch)), get_mismatch(d-1, setOfMmatch))
            
        return mmatchHash.get((d-1, str(setOfMmatch)))
    
    
    length_of_DNA=1000
    dna=''.join(random.choices('AGCT', k=length_of_DNA))
    hamm_dist=4
    length=9
    
    len(list(itertools.chain.from_iterable([get_mismatch(hamm_dist, {dna[i:i+length]}) for i in range(len(dna)-length+1)])))
    # set(itertools.chain.from_iterable([get_mismatch(hamm_dist, {dna[i:i+length]}) for i in range(len(dna)-length+1)]))
    

    CPU times: user 1min 32s, sys: 1.81 s, total: 1min 34s Wall time: 1min 34s