I'm new to parallel processing in Python. I have a piece of code below that walks through all directories and unzips all tar.gz files. However, it takes quite a bit of time.
import tarfile
import gzip
import os

def unziptar(path):
    for root, dirs, files in os.walk(path):
        for i in files:
            fullpath = os.path.join(root, i)
            if i.endswith("tar.gz"):
                print('extracting... {}'.format(fullpath))
                tar = tarfile.open(fullpath, 'r:gz')
                tar.extractall(root)
                tar.close()

path = 'C://path_to_folder'
unziptar(path)
print('tar.gz extraction completed')
I have been looking through some posts on the multiprocessing and joblib packages, but I'm still not very clear on how to modify my script to run in parallel. Any help is appreciated.
EDIT: @tdelaney
Thanks for the help. The surprising thing is that the modified script took twice as long to unzip everything (60 minutes compared to 30 minutes with the original script)!
I looked at the Task Manager and it appears that while multiple cores were utilised, the CPU usage is very low. I'm not sure why this is so.
It's pretty easy to create a pool to do the work. Just pull the extractor out into a separate worker.
import tarfile
import gzip
import os
import multiprocessing as mp

def unziptar(fullpath):
    """worker unzips one file"""
    print('extracting... {}'.format(fullpath))
    tar = tarfile.open(fullpath, 'r:gz')
    tar.extractall(os.path.dirname(fullpath))
    tar.close()

def fanout_unziptar(path):
    """create pool to extract all"""
    my_files = []
    for root, dirs, files in os.walk(path):
        for i in files:
            if i.endswith("tar.gz"):
                my_files.append(os.path.join(root, i))

    pool = mp.Pool(min(mp.cpu_count(), len(my_files)))  # number of workers
    pool.map(unziptar, my_files, chunksize=1)
    pool.close()

if __name__ == "__main__":
    path = 'C://path_to_folder'
    fanout_unziptar(path)
    print('tar.gz extraction has completed')
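Since you mentioned joblib as well: the same fan-out pattern can be written with joblib.Parallel. This is just a minimal sketch, assuming joblib is installed and reusing the unziptar worker and the path variable from above; it is not part of the original answer.

from joblib import Parallel, delayed
import os

# Collect the tar.gz paths the same way fanout_unziptar does,
# then hand them to a joblib worker pool.
my_files = []
for root, dirs, files in os.walk(path):
    for i in files:
        if i.endswith("tar.gz"):
            my_files.append(os.path.join(root, i))

# n_jobs=-1 uses all available cores; each delayed call extracts one archive.
Parallel(n_jobs=-1)(delayed(unziptar)(f) for f in my_files)

Parallel manages its own worker pool, so there is no explicit close() step as with multiprocessing.Pool.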