
Parallelization of downloading thousands of files using wget


I have thousands of files like the ones below to download.

urls = [
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0450.061.2019001110251.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0455.061.2019001110452.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0500.061.2019001110658.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0535.061.2019001110116.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0555.061.2019001132709.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0615.061.2019001132734.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0630.061.2019001132950.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0635.061.2019001133203.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0650.061.2019001132727.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0655.061.2019001132653.hdf'
]

I can download them one at a time using wget as follows.

#wget is here, https://eternallybored.org/misc/wget/1.21.3/64/wget.exe

import os, glob, subprocess
import itertools
import multiprocessing as mp
import concurrent.futures

header = "authorizationkey"
rd = '.\\results\\'
for url in urls:
    app = r"C:\Users\daniel\Downloads\wget.exe"
    subprocess.call([app, 
                     '--header', header,
                     '-P', rd, url])

However, downloading them one at a time is very slow, so I wanted to use a parallel or multithreaded approach. I tried the following.

def doers(urls):
    for url in urls:
        app = r"C:\Users\daniel\Downloads\wget.exe"
        subprocess.call([app, 
                         '--header', header,
                         '-P', rd, url])
def batched(iterable, n):
    it = iter(iterable)
    while (batch := tuple(itertools.islice(it, n))):
        yield batch

num_cpus = mp.cpu_count() - 1 

with mp.Pool(num_cpus) as pool:
    pool.map(doers, batched(urls, num_cpus))

However, my PC just hangs and does not produce any results. Can someone help me?


Solution

  • Multithreading is probably optimal for this. You don't need wget at all if you have the requests module installed.

    You may want to limit the number of concurrent threads (the max_workers argument to the ThreadPoolExecutor constructor), as you're likely to "flood" your network while streaming these fairly large files.
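    For instance, capping the pool at eight worker threads (eight is only an assumed starting point; tune it for your connection) looks like this. The snippet maps a placeholder function instead of a real download so it stays runnable without network access:

```python
from concurrent.futures import ThreadPoolExecutor

# max_workers caps concurrent downloads; without it, Python 3.8+ defaults
# to min(32, os.cpu_count() + 4), which may be more than your link can feed.
with ThreadPoolExecutor(max_workers=8) as tpe:
    # len() stands in for a process(url) download function
    results = list(tpe.map(len, ['a', 'bb', 'ccc']))

print(results)  # map preserves input order
```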

    Something like this:

    import requests
    import os
    from concurrent.futures import ThreadPoolExecutor
    import sys
    
    
    HEADER = {'Authorization': AuthorizationKey}  # requests expects a dict; substitute your real token
    CHUNK = 16 * 1024
    TARGET = '/Volumes/G-Drive/results'
    
    def process(url):
        try:
            with requests.get(url, headers=HEADER, stream=True) as r:
                r.raise_for_status()
                *_, filename = url.split('/')
                os.makedirs(TARGET, exist_ok=True)
                with open(os.path.join(TARGET, filename), 'wb') as hdf:
                    for chunk in r.iter_content(chunk_size=CHUNK):
                        hdf.write(chunk)
        except Exception as e:
            print(e, file=sys.stderr)
    
    
    def main():
        urls = [
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0450.061.2019001110251.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0455.061.2019001110452.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0500.061.2019001110658.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0535.061.2019001110116.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0555.061.2019001132709.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0615.061.2019001132734.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0630.061.2019001132950.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0635.061.2019001133203.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0650.061.2019001132727.hdf',
            'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0655.061.2019001132653.hdf'
        ]
        with ThreadPoolExecutor() as tpe:
            tpe.map(process, urls)
    
    
    if __name__ == '__main__':
        main()
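
  • As for why the multiprocessing version hangs: on Windows, multiprocessing starts child processes by spawning fresh interpreters that re-import your script, so module-level code — including the Pool creation itself — runs again in every child unless it is protected by an if __name__ == '__main__': guard. Missing that guard is a common cause of exactly this kind of hang. Below is a guarded sketch of the original approach; the URLs are placeholders, and a print stands in for the wget subprocess call so the sketch runs anywhere:

```python
import itertools
import multiprocessing as mp

def doers(batch):
    # In the real script this loop would invoke wget via subprocess.call
    # for each URL; printing keeps the sketch runnable without wget.
    for url in batch:
        print('downloading', url)

def batched(iterable, n):
    # Yield successive n-item tuples from iterable (Python 3.8+ walrus).
    it = iter(iterable)
    while (batch := tuple(itertools.islice(it, n))):
        yield batch

if __name__ == '__main__':
    # The guard is the crucial part: spawned children re-import this
    # module, and without it each child would try to build its own Pool.
    urls = ['https://example.com/a.hdf',
            'https://example.com/b.hdf',
            'https://example.com/c.hdf']
    with mp.Pool(2) as pool:
        pool.map(doers, list(batched(urls, 2)))
```

    Note too that batching by num_cpus means each worker downloads its whole batch serially; smaller batches keep all workers busy toward the end of the list.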