Tags: python, multithreading, multiprocessing, fastq

Reading large fastq files faster with Python


I have several fastq files with 500,000,000 lines (125,000,000 sequences) on average. Is there a faster way to read these fastq files?

What I want to do is read each sequence, use its first 16 bases as a barcode, and then count the number of occurrences of each barcode in each file.

Here is my script, which takes hours:

import os, errno
from Bio import SeqIO
import gzip

# keep only the gzipped fastq files in the current directory
files = os.listdir(".")
for file in files[:]:
    if not file.endswith(".fastq.gz"):
        files.remove(file)

maps = {}
for file in files:
    print("Now parsing file %s" % file)
    maps[file] = {}
    with gzip.open(file, "rt") as handle:  # "rt" so SeqIO receives text, not bytes
        recs = SeqIO.parse(handle, "fastq")
        for rec in recs:
            tag = str(rec.seq)[0:16]  # first 16 bases of the read = barcode
            if tag not in maps[file]:
                maps[file][tag] = 1
            else:
                maps[file][tag] += 1

I have 250 GB of RAM and 20 CPUs that can be used for multi-threading ...

Thanks.


Solution

  • Untested, but here's a way you could do this in an 'embarrassingly parallel' fashion:

    import multiprocessing as mp
    import os, errno
    from Bio import SeqIO
    import gzip

    def ImportFile(file):
        # count the 16-base barcodes in a single fastq.gz file
        maps = {}
        with gzip.open(file, "rt") as handle:  # "rt" so SeqIO receives text, not bytes
            recs = SeqIO.parse(handle, "fastq")
            for rec in recs:
                tag = str(rec.seq)[0:16]
                if tag not in maps:
                    maps[tag] = 1
                else:
                    maps[tag] += 1

        return {file: maps}


    # keep only the gzipped fastq files in the current directory
    files = os.listdir(".")
    for file in files[:]:
        if not file.endswith(".fastq.gz"):
            files.remove(file)

    if __name__ == "__main__":
        # I'd test this with smaller numbers before using up all 20 cores
        pool = mp.Pool(processes=10)
        output = pool.map(ImportFile, files)
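
  • Also untested, but since pool.map returns a list of one-entry {file: counts} dicts, you would still need to merge them afterwards. A minimal sketch of that step (the top-barcode report at the end is purely illustrative):

    # combine the per-file results returned by pool.map into one dict
    combined = {}
    for result in output:
        combined.update(result)

    # for example, print the five most common barcodes in each file
    for file, counts in combined.items():
        top = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:5]
        print(file, top)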