I have this function, which generates all possible k-mers over the four bases in Python:
def generate_kmers(k):
    """Return every k-mer of length ``k`` over the four DNA bases.

    The k-mers are the Cartesian product of the bases with itself ``k``
    times, each tuple joined into a single string. They are returned in the
    order ``itertools.product`` yields them (last position varies fastest,
    following the base order A, C, T, G).

    Args:
        k: Length of the k-mers to generate.

    Returns:
        A list of k-mer strings.
    """
    # Task (a) only asks for k-mers over the four bases.
    bases = ['A', 'C', 'T', 'G']
    # The repeat count must be the parameter ``k``; the previous name
    # ``length_kmer`` was never defined and raised NameError.
    return [''.join(p) for p in itertools.product(bases, repeat=k)]
Now I want to go through a list of sequences from a FASTQ file (e.g. ['GTATACACTAGTCCAGGATGTGCTTCTTGTAGAAAAGTAAAACAATGGTTAAAAGATCACAATCTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN', 'CCTGTAGAGTCATAAAGACCTCTTGGGTCCATCCTAGAAATTTTTCAGCTGAGAATAACGGGTCTGTTTCAGTTATTGCTTCTACTATNNNNNNNNNNNNNNNNNNNNNNNNNNN']), count the occurrences of every k-mer produced by generate_kmers in that list of sequences, and save the counts in a dictionary (e.g. {AAAA: 2, AAAC: 1...}). First I tried to modify generate_kmers so that it returns all k-mers of the sequence file, and then to iterate over kmerSequences and kmerBases, but that didn't work.
Does anybody have an idea how I can do this?
You could try this with count:
import itertools
def generate_kmers(k):
    """Return every k-mer of length ``k`` over the four DNA bases.

    The k-mers are the Cartesian product of the bases with itself ``k``
    times, each tuple joined into a single string. They are returned in the
    order ``itertools.product`` yields them (last position varies fastest,
    following the base order A, C, T, G).

    Args:
        k: Length of the k-mers to generate.

    Returns:
        A list of k-mer strings.
    """
    # Task (a) only asks for k-mers over the four bases.
    bases = ['A', 'C', 'T', 'G']
    # Each product element is a tuple of k bases; join it into one string.
    return [''.join(p) for p in itertools.product(bases, repeat=k)]
seqs = [
    'GTATACACTAGTCCAGGATGTGCTTCTTGTAGAAAAGTAAAACAATGGTTAAAAGATCACAATCTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN',
    'CCTGTAGAGTCATAAAGACCTCTTGGGTCCATCCTAGAAATTTTTCAGCTGAGAATAACGGGTCTGTTTCAGTTATTGCTTCTACTATNNNNNNNNNNNNNNNNNNNNNNNNNNN',
]
k = 4
mers4 = generate_kmers(k)
# Build one dictionary per sequence, mapping every k-mer to its number of
# occurrences. A sliding window is used instead of str.count(), because
# str.count() only counts non-overlapping matches and would report 'TTTT'
# once in 'TTTTT' although it occurs at two positions.
dcts = []
for seq in seqs:
    counts = dict.fromkeys(mers4, 0)
    for start in range(len(seq) - k + 1):
        window = seq[start:start + k]
        # Windows that are not in mers4 (e.g. those containing 'N') are skipped.
        if window in counts:
            counts[window] += 1
    dcts.append(counts)
print(dcts)
Edit:
import itertools
import re
def generate_kmers(k):
    """Return every k-mer of length ``k`` over the four DNA bases.

    The k-mers are the Cartesian product of the bases with itself ``k``
    times, each tuple joined into a single string. They are returned in the
    order ``itertools.product`` yields them (last position varies fastest,
    following the base order A, C, T, G).

    Args:
        k: Length of the k-mers to generate.

    Returns:
        A list of k-mer strings.
    """
    # Task (a) only asks for k-mers over the four bases.
    bases = ['A', 'C', 'T', 'G']
    # Each product element is a tuple of k bases; join it into one string.
    return [''.join(p) for p in itertools.product(bases, repeat=k)]
# The sequence whose k-mers are to be counted.
s = 'GTATACACTAGTCCAGGATGTGCTTCTTGTAGAAAAGTAAAACAATGGTTAAAAGATCACAATCTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
# All k-mers of length k over the four bases.
k = 4
mers4 = generate_kmers(k)
def dct_count(seq, kmers=None):
    """Count how often each k-mer occurs in ``seq``.

    Args:
        seq: Sequence string to search.
        kmers: Iterable of k-mer strings to count. When omitted, the
            module-level ``mers4`` list is used.

    Returns:
        A dictionary mapping each k-mer to its number of occurrences in
        ``seq``, overlapping occurrences included.
    """
    if kmers is None:
        kmers = mers4
    # Search the ``seq`` argument (not the global ``s``). The zero-width
    # lookahead matches at every start position, so overlapping occurrences
    # (e.g. 'AAAA' twice in 'AAAAA') are all counted; a plain pattern would
    # consume each match and skip them.
    return {
        mer: len(re.findall('(?=' + re.escape(mer) + ')', seq))
        for mer in kmers
    }
# Count every k-mer in the given sequence and display the resulting dictionary.
dc = dct_count(s)
print(dc)