Search code examples
pythonperformancebioinformaticsbiopython

How to optimize code for a file with 72 million entries?


I have asked a question on another platform (here) - it would be great to get your input in order to make my Python code run in a very short time. Currently, it has been taking more than 3 hours for a file with millions of entries.

from Bio import SeqIO
import sys


def QIAseq_UMI_correction():
    script=sys.argv[0]
    file_name=sys.argv[1]
    dicts1 = {}
    dicts2 = {}
    lst = []
    with open(file_name, "r") as Fastq:
        for record in SeqIO.parse(Fastq,'fastq'):
            #print(record.id)
            #print(record.seq)
            #looking for the 3 prime adapter
            if "AACTGTAGGCACCATCAAT" in record.seq:
                adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
                #Only record is used to be able to save the all atributes like phred score in the fastq file
                miRNAseq = record[:adapter_pos]
                adapter_seq=record[adapter_pos:adapter_pos+19]
                umi_seq = record[adapter_pos+19:adapter_pos+19+12]
                i = record.id
                x = miRNAseq.seq+umi_seq.seq
                #print((miRNAseq+umi_seq).format("fastq"))
                dicts1[i]=x

        #write ids and seq in a dictionary and keep one if there are multiple seqs with miRNA-seq + UMI
        for k,v in dicts1.items():
            if v not in dicts2.values():
                dicts2[k] = v

        #making a list
        for keys in dicts2:
            lst.append(keys)

    #print(dicts1)
    #print(dicts2)
    #print(lst)


    with open(file_name, "r") as Fastq:
        for record in SeqIO.parse(Fastq,'fastq'):
            #based on the saved ids in the list print the entries (miRNA + 3' adapter + UMI)
            if record.id in lst:
                adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
                miRNAseq = record[:adapter_pos]
                adapter_seq=record[adapter_pos:adapter_pos+19]
                umi_seq = record[adapter_pos+19:adapter_pos+19+12]
                #print(record.seq)
                #print(miRNAseq.seq)
                #print(adapter_seq.seq)
                #print(umi_seq.seq)
                #print("@"+record.id)
                if len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) <= 50:
                    print((miRNAseq+adapter_seq+umi_seq).format("fastq"),end='')

                if len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) > 50:
                    cut = len(miRNAseq.seq+adapter_seq.seq+umi_seq.seq) - 50
                    print((miRNAseq+adapter_seq+umi_seq)[:-cut].format("fastq"), end='')

if __name__ == '__main__':
    QIAseq_UMI_correction()

Solution

  • I solved it by removing the second part - then checking if x is not present in the dicts.keys() (searching for dicts.values() was much slower and was the real bottleneck) then writing y in output. I only use one dict now and the code is already generating output. Here is the updated version.

    from Bio import SeqIO
    import sys
    
    def QIAseq_UMI_correction():
        script=sys.argv[0]
        file_name=sys.argv[1]
        dicts = {}
        lst = []
        
        with open(file_name, "r") as Fastq:
            for record in SeqIO.parse(Fastq,'fastq'):
                
                if "AACTGTAGGCACCATCAAT" in record.seq:
                    adapter_pos = record.seq.find('AACTGTAGGCACCATCAAT')
                    miRNAseq = record[:adapter_pos]
                    adapter_seq=record[adapter_pos:adapter_pos+19]
                    umi_seq = record[adapter_pos+19:adapter_pos+19+12]
                    i = record.id
                    x = miRNAseq.seq+umi_seq.seq
                    y = miRNAseq.seq+adapter_seq.seq+umi_seq.seq
                    #print((miRNAseq+umi_seq).format("fastq"))
                    if x not in dicts.keys():
                        dicts[x]=i
    
                        if len(y) <= 50:
                            print((miRNAseq+adapter_seq+umi_seq).format("fastq"),end='')
                        
                        if len(y) > 50:
                            cut = len(y) - 50
                            print((miRNAseq+adapter_seq+umi_seq)[:-cut].format("fastq"), end='')
    if __name__ == '__main__':
        QIAseq_UMI_correction()