I've been trying to download thousands of images in my local filesystem but it hasn't worked correctly because I got an exception called asyncio.exceptions.TimeoutError when I had downloaded around 5,000 images separated by directories.
The first time I executed next script I got 16.000 downloads, but each time I execute it, it decrease the number of downloaded images and currently I'm around 5,000 images.
That's the script I've implemented:
import os
import asyncio
import aiofiles
import async_timeout
from aiohttp import ClientSession
from generator import generate_hash
from logger import logger
from typing import List, Dict, Any
async def download_file(session: Any, remote_url: str, filename: str) -> None:
try:
async with async_timeout.timeout(120):
async with session.get(remote_url) as response:
if response.status == 200:
async with aiofiles.open(filename, mode='wb') as f:
async for data in response.content.iter_chunked(1024):
await f.write(data)
else:
logger.error(f"Error to get {filename} from Remote Server")
except asyncio.TimeoutError:
logger.error(f"Timeout error to download {filename} into Local Server")
raise
async def download_files(images: List[Dict[str, Any]], path: str) -> None:
headers = {"user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
async with ClientSession(headers=headers) as session:
tasks = [asyncio.ensure_future(download_file(session, image['resource'], get_filename(image, path))) for image in images]
await asyncio.gather(*tasks)
def download_images(images: List[Dict[str, Any]], path: str) -> None:
try:
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(download_files(images, path))
loop.run_until_complete(future)
logger.info(f'Images from Remote Server have been downloaded successfully')
except Exception as error:
logger.error(f'Error to download images from Remote Server: {error}')
raise
def get_filename(image: Dict[str, Any], path: str) -> str:
image_dir = '{}/{}'.format(path, image['id'])
image_file = '{}.jpg'.format(generate_hash(image['resource']))
if not os.path.exists(image_dir):
os.makedirs(image_dir)
return os.path.join(image_dir, image_file)
def main():
images = [
{
'id': '10755431',
'resource': 'http://image1.jpg'
},
{
'id': '10755432',
'resource': 'http://image2.jpg'
},
{
'id': '101426201',
'recurso': 'http://image3.jpg'
}
]
IMAGES_PATH = '/home/stivenramireza'
download_images(images, IMAGES_PATH)
if __name__ == "__main__":
main()
I got this error:
ERROR:root:Timeout error to download /home/stivenramireza/10755431/664e3bdd10cd69452774f38ec822a9eb.jpg into Local Server
ERROR:root:Error to download images from Remote Server:
Traceback (most recent call last):
File "/home/stivenramireza/storage/main.py", line 17, in download_file
async for data in response.content.iter_chunked(1024):
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py", line 39, in __anext__
rv = await self.read_func()
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py", line 368, in read
await self._wait('read')
File "/home/stivenramireza/.local/lib/python3.8/site-packages/aiohttp/streams.py", line 296, in _wait
await waiter
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 70, in <module>
main()
File "main.py", line 67, in main
download_images(images, IMAGES_PATH)
File "/home/stivenramireza/storage/main.py", line 34, in download_images
loop.run_until_complete(future)
File "/usr/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/home/stivenramireza/storage/main.py", line 28, in download_files
await asyncio.gather(*[asyncio.ensure_future(download_file(session, image['recurso'], get_filename(image, path))) for image in images])
File "/home/stivenramireza/storage/main.py", line 20, in download_file
logger.error(f"Error to get {filename} from Re Server")
File "/home/stivenramireza/.local/lib/python3.8/site-packages/async_timeout/__init__.py", line 55, in __aexit__
self._do_exit(exc_type)
File "/home/stivenramireza/.local/lib/python3.8/site-packages/async_timeout/__init__.py", line 92, in _do_exit
raise asyncio.TimeoutError
asyncio.exceptions.TimeoutError
What should I do?
Thanks in advance.
Your download_file
function catches the timeout error and re-raises it. Your download_files
function uses asyncio.gather()
which exits on first exception and propagates it to the caller. It is reasonable to assume that, when downloading a large number of files, sooner or later one of them times out, in which case your whole program gets interrupted.
What should I do?
That depends on what you want your program to do in case of a timeout. For example, you might want to retry that file, or you might want to give up. But you most likely don't want to interrupt the whole download because of a single file that has timed out.
While re-raising an exception you've caught is in many cases the right thing to do, it is not the right thing here. You can change raise
at the end of download_file
to return (remote_url, filename)
which will result in gather()
returning a list of failed downloads and you can try to download them again.