I have thousands of files like the ones below that need to be downloaded.
# LAADS DAAC MOD03 granules for day 2019/001 — one URL per line for readability.
urls = [
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0450.061.2019001110251.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0455.061.2019001110452.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0500.061.2019001110658.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0535.061.2019001110116.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0555.061.2019001132709.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0615.061.2019001132734.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0630.061.2019001132950.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0635.061.2019001133203.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0650.061.2019001132727.hdf',
    'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0655.061.2019001132653.hdf',
]
I can download them one by one using wget as follows.
#wget is here, https://eternallybored.org/misc/wget/1.21.3/64/wget.exe
import os, glob, subprocess
import itertools
import multiprocessing as mp
import concurrent.futures
# NOTE(review): wget's --header flag expects a full "Name: value" header line
# (e.g. "Authorization: Bearer <token>"); a bare token string is passed through
# verbatim and may be rejected by the server — confirm the real value's format.
header = "authorizationkey"
rd = '.\\results\\'
# The wget path is loop-invariant — bind it once, not on every iteration.
app = r"C:\Users\daniel\Downloads\wget.exe"
for url in urls:
    # One blocking wget process per URL: download into rd, sending the header.
    subprocess.call([app,
                     '--header', header,
                     '-P', rd, url])
However, downloading them one by one is very slow, so I wanted to use a parallelization or multithreading approach. I tried the following.
def doers(urls):
    """Sequentially download every URL in *urls* with wget.

    Each worker process gets one batch of URLs and shells out to wget once
    per URL, downloading into the module-level directory ``rd``.

    Bug fix: the original body referenced an undefined name ``authorization``,
    which raised ``NameError`` inside every worker process — one reason the
    pool appeared to produce no results. The module-level ``header`` defined
    next to ``rd`` is what was intended.
    """
    # The wget path is loop-invariant — hoist it out of the loop.
    app = r"C:\Users\daniel\Downloads\wget.exe"
    for url in urls:
        subprocess.call([app,
                         '--header', header,
                         '-P', rd, url])
def batched(iterable, n):
    """Yield successive tuples of at most *n* items from *iterable*.

    The final tuple may be shorter than *n*; an empty iterable yields nothing.
    """
    iterator = iter(iterable)
    chunk = tuple(itertools.islice(iterator, n))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(iterator, n))
# Bug fix: on Windows, multiprocessing uses the "spawn" start method — every
# worker re-imports this module from the top. Without the __main__ guard each
# worker re-executes the Pool creation and tries to spawn workers of its own,
# which is exactly the "PC just hangs" symptom described above.
if __name__ == '__main__':
    num_cpus = mp.cpu_count() - 1  # leave one core for the OS/UI
    with mp.Pool(num_cpus) as pool:
        # One batch of URLs per worker; doers() downloads its batch serially.
        pool.map(doers, batched(urls, num_cpus))
However, my PC just hangs and does not produce any results. Can someone help me?
Multithreading is probably optimal for this. You don't need wget if you have the requests module installed.
You may want to consider reducing the number of concurrent threads (ThreadPoolExecutor constructor) as you're likely to "flood" your network as you stream these fairly large files.
Something like this:
import requests
import os
from concurrent.futures import ThreadPoolExecutor
import sys
# Bug fix: requests expects ``headers`` to be a dict mapping header names to
# values; the original ``{AuthorizationKey}`` was a *set* literal, which
# requests cannot use. Replace AuthorizationKey with your real LAADS token
# value (typically of the form 'Bearer <token>').
HEADER = {'Authorization': AuthorizationKey}
CHUNK = 16 * 1024                    # stream the response in 16 KiB pieces
TARGET = '/Volumes/G-Drive/results'  # directory the .hdf files are written to
def process(url):
    """Stream a single *url* to a file under TARGET.

    The file name is the last path component of the URL. Any failure —
    HTTP error status, network problem, disk error — is printed to stderr
    instead of propagating, so one bad download never kills the pool.
    """
    try:
        with requests.get(url, headers=HEADER, stream=True) as response:
            response.raise_for_status()
            filename = url.rsplit('/', 1)[-1]
            os.makedirs(TARGET, exist_ok=True)
            destination = os.path.join(TARGET, filename)
            with open(destination, 'wb') as out_file:
                for block in response.iter_content(chunk_size=CHUNK):
                    out_file.write(block)
    except Exception as error:
        print(error, file=sys.stderr)
def main():
    """Fan the download list out across a thread pool.

    Threads suit this workload: the GIL is released while each request
    blocks on the network, so downloads overlap.
    """
    urls = [
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0450.061.2019001110251.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0455.061.2019001110452.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0500.061.2019001110658.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0535.061.2019001110116.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0555.061.2019001132709.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0615.061.2019001132734.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0630.061.2019001132950.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0635.061.2019001133203.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0650.061.2019001132727.hdf',
        'https://ladsweb.modaps.eosdis.nasa.gov//archive/allData/61/MOD03/2019/001/MOD03.A2019001.0655.061.2019001132653.hdf'
    ]
    with ThreadPoolExecutor() as executor:
        executor.map(process, urls)


if __name__ == '__main__':
    main()