Tags: python, python-requests, python-multithreading

Downloading multiple URLs with ThreadPool


I'm having a problem downloading multiple URLs. My code still only downloads one URL per session; it has to finish the first one before starting the next.

I want to download, say, 3 URLs at the same time.

Here's my code:

from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

import requests
from tqdm import tqdm

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'
}

def download(path, video_url, bar: tqdm):
    res = requests.get(video_url, headers=headers, stream=True)

    with open(path, 'wb') as f:
        for b in res.iter_content(1024):
            f.write(b)
            bar.update(len(b))

def get_length(video_url):
    res = requests.get(video_url, headers=headers, stream=True)
    le = int(res.headers['Content-Length'])
    return le

def download_all(urls: list, thread: int = cpu_count()):

    total = len(urls)
    count = 0

    pool = ThreadPool(thread)  # https://stackoverflow.com/a/56528204/14951175

    for url in urls:
        output_file = get_url_path(url)
        count += 1
        content_length = get_length(video_url=url)
        with tqdm(total=content_length, unit='B', ncols=(150-1), desc=f'Downloading {count} of {total}', unit_divisor=1024, ascii=True, unit_scale=True) as bar:
            pool.apply_async(download(output_file, url, bar))
    pool.close()
    pool.join()


urls = read_lines('urls.txt')
download_all(urls)

Solution

  • This line

    pool.apply_async(download(output_file, url, bar))
    

    must be

    pool.apply_async(download, (output_file, url, bar))
    

    Otherwise you call the download function immediately in the main thread instead of passing it (and its arguments) to the ThreadPool.
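
    For illustration, here is a minimal standalone sketch of the difference; the work function and its argument are made up for the example:

    from multiprocessing.pool import ThreadPool

    def work(n):
        return n * n

    pool = ThreadPool(3)

    # Wrong: work(2) runs immediately on the main thread, and only its
    # *return value* (4) is handed to apply_async as the "function".
    # pool.apply_async(work(2))

    # Right: pass the callable and an args tuple; the pool invokes
    # work(2) on one of its worker threads.
    result = pool.apply_async(work, (2,))
    print(result.get())  # 4

    pool.close()
    pool.join()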


    Edit

    Use starmap to map the URLs onto a func that performs the download (by the way: this also saves the duplicate GET request). And add the position argument so each bar gets its own line.
    To be honest, the bars don't work very smoothly, but I don't have much experience with tqdm or ThreadPool. In general, though, the downloads seem to work.

    import sys

    def download_all(urls: list, thread: int = cpu_count()):
        total = len(urls)
    
        pool = ThreadPool(thread)
    
        def func(count, url):
            output_file = get_url_path(url)
            req = requests.get(url, headers=headers, stream=True)
            content_length = int(req.headers['Content-Length'])
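            # position=count gives each bar its own line in the terminal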
            with tqdm(total=content_length, unit='B', desc=f'Downloading {count + 1} of {total}',
                      unit_divisor=1024, ascii=True, unit_scale=True, position=count, file=sys.stdout) as bar:
                with open(output_file, 'wb') as f:
                    for b in req.iter_content(1024):
                        f.write(b)
                        bar.update(len(b))
    
        pool.starmap(func, enumerate(urls))
    
        pool.close()
        pool.join()
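
    As a design note: if you don't need ThreadPool specifically, the same pattern can be written with concurrent.futures.ThreadPoolExecutor from the standard library. A minimal sketch, assuming headers and the get_url_path helper are defined as in the question:

    import sys
    from concurrent.futures import ThreadPoolExecutor

    import requests
    from tqdm import tqdm

    def download_all(urls: list, threads: int = 4):
        total = len(urls)

        def func(count, url):
            output_file = get_url_path(url)  # helper from the question
            req = requests.get(url, headers=headers, stream=True)
            content_length = int(req.headers['Content-Length'])
            with tqdm(total=content_length, unit='B', desc=f'Downloading {count + 1} of {total}',
                      unit_divisor=1024, ascii=True, unit_scale=True, position=count, file=sys.stdout) as bar:
                with open(output_file, 'wb') as f:
                    for b in req.iter_content(1024):
                        f.write(b)
                        bar.update(len(b))

        # Executor.map(func, counts, urls) calls func(count, url) for each pair;
        # list(...) forces the lazy iterator so any worker exception surfaces,
        # and leaving the with-block waits for all downloads to finish.
        with ThreadPoolExecutor(max_workers=threads) as ex:
            list(ex.map(func, range(total), urls))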