I have asked a question on another platform (here) - it would be great to get your input in order to make my Python code run in a very short time. Currently, it has been taking more than 3 hours for a file with millions of entries.
from Bio import SeqIO
import sys
def QIAseq_UMI_correction():
script=sys.argv[0]
file_name=sys.argv[1]
dicts1 = {}
dicts2 = {}
lst = []
with open(file_name, "r") as Fastq:
for record in SeqIO.parse(Fastq,'fastq'):
#print(record.id)
#print(record.seq)
#looking for the 3 prime adapter
if "AACTGTAGGCACCATCAAT" in record.seq:
adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
#Only record is used to be able to save the all atributes like phred score in the fastq file
miRNAseq = record[:adapter_pos]
adapter_seq=record[adapter_pos:adapter_pos+19]
umi_seq = record[adapter_pos+19:adapter_pos+19+12]
i = record.id
x = miRNAseq.seq+umi_seq.seq
#print((miRNAseq+umi_seq).format("fastq"))
dicts1[i]=x
#write ids and seq in a dictionary and keep one if there are multiple seqs with miRNA-seq + UMI
for k,v in dicts1.items():
if v not in dicts2.values():
dicts2[k] = v
#making a list
for keys in dicts2:
lst.append(keys)
#print(dicts1)
#print(dicts2)
#print(lst)
with open(file_name, "r") as Fastq:
for record in SeqIO.parse(Fastq,'fastq'):
#based on the saved ids in the list print the entries (miRNA + 3' adapter + UMI)
if record.id in lst:
adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
miRNAseq = record[:adapter_pos]
adapter_seq=record[adapter_pos:adapter_pos+19]
umi_seq = record[adapter_pos+19:adapter_pos+19+12]
#print(record.seq)
#print(miRNAseq.seq)
#print(adapter_seq.seq)
#print(umi_seq.seq)
#print("@"+record.id)
if len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) <= 50:
print((miRNAseq+adapter_seq+umi_seq).format("fastq"),end='')
if len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) > 50:
cut = len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) - 50
print((miRNAseq+adapter_seq+umi_seq)[:-cut].format("fastq"), end='')
if __name__ == '__main__':
QIAseq_UMI_correction()
I solved it by removing the second part - then checking if x is not present in the dicts.keys()
(searching for dicts.values()
was much slower and was the real bottleneck) then writing y
in output. I only use one dict now and the code is already generating output. Here is the updated version.
from Bio import SeqIO
import sys
def QIAseq_UMI_correction():
script=sys.argv[0]
file_name=sys.argv[1]
dicts = {}
lst = []
with open(file_name, "r") as Fastq:
for record in SeqIO.parse(Fastq,'fastq'):
if "AACTGTAGGCACCATCAAT" in record.seq:
adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
miRNAseq = record[:adapter_pos]
adapter_seq=record[adapter_pos:adapter_pos+19]
umi_seq = record[adapter_pos+19:adapter_pos+19+12]
i = record.id
x = miRNAseq.seq+umi_seq.seq
y = miRNAseq.seq+adapter_seq.seq+umi_seq.seq
#print((miRNAseq+umi_seq).format("fastq"))
if x not in dicts.keys():
dicts[x]=i
if len(y) <= 50:
print((miRNAseq+adapter_seq+umi_seq).format("fastq"),end='')
if len(y) > 50:
cut = len(y) - 50
print((miRNAseq+adapter_seq+umi_seq)[:-cut].format("fastq"), end='')
if __name__ == '__main__':
QIAseq_UMI_correction()