Tags: python, asynchronous, web-scraping, python-asyncio, tqdm

Making asynchronous HTTP requests using Python's asyncio


I am trying to do some scraping: I have a list of URLs, I fetch the HTML response for each one, and then continue with the scraping. Naturally, I have attempted to make the requests to the URLs asynchronously, but I have failed.

Here is what I have so far:

import aiohttp
import asyncio


async def save_file(row, file_path):
    with open(file_path, 'w') as f:
        f.write(row)


async def download_html(url_idx, url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:

            # Requesting the page
            body = await resp.text()

            file_path = #path to output file

            #some scraping logic......

            await save_file(#result of the scraping, file_path)


async def main():
    urls = ['url1', 'url2', 'url3']
    tasks = []

    for url_idx, url in enumerate(urls):
        task = asyncio.create_task(download_html(url_idx, url))
        tasks.append(task)
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

Running the above code gives me this error:

There is no current event loop: loop = asyncio.get_event_loop()

What would be the right approach?

I would also love to know how to use tqdm to display a progress bar.


Solution

  • Here is an example of how you can restructure your code a little and integrate a tqdm progress bar:

    import asyncio
    
    import aiohttp
    from bs4 import BeautifulSoup
    from tqdm import tqdm
    
    
    async def download_html(session, url):
        async with session.get(url) as resp:
            # Requesting the page
            body = await resp.text()
    
            soup = BeautifulSoup(body, "html.parser")
            return url, soup.title.text
    
    
    async def main():
        urls = [
            "http://www.google.com",
            "http://www.google.es",
            "http://www.google.de",
        ]
    
        # One ClientSession is shared by all requests
        async with aiohttp.ClientSession() as session:
            tasks = set()
            for url in urls:
                task = asyncio.create_task(download_html(session, url))
                tasks.add(task)

            results = []
            # as_completed yields tasks as they finish, so wrapping it in tqdm
            # advances the progress bar once per completed request
            for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                url, title = await t
                results.append((url, title))
    
        print()
        print("Results:")
        print(*results, sep="\n")
    
        # save the results, etc.
        # ...
    
    
    if __name__ == "__main__":
        asyncio.run(main())
    

    Prints:

    100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24.20it/s]
    
    Results:
    ('http://www.google.com', 'Google')
    ('http://www.google.de', 'Google')
    ('http://www.google.es', 'Google')
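
    A note on the original error: in recent Python versions (3.10 and later), calling asyncio.get_event_loop() when no event loop is running emits exactly that "There is no current event loop" warning. asyncio.run(main()) creates and closes the loop for you, so it replaces the get_event_loop() / run_until_complete() pair. The other change above is that a single ClientSession is shared by all requests instead of opening a new session per URL.

    If you also want to write each page (or scraping result) to a file, as your original save_file did, keep in mind that a plain open()/write() call blocks the event loop. Below is a minimal sketch of one way around that, pushing the blocking write into a worker thread with asyncio.to_thread (Python 3.9+); the page_<idx>.html file names are only a placeholder:

    import asyncio

    import aiohttp


    def write_sync(text, file_path):
        # Ordinary blocking file write, executed off the event loop
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text)


    async def save_file(text, file_path):
        # Run the blocking write in a worker thread (Python 3.9+)
        await asyncio.to_thread(write_sync, text, file_path)


    async def download_html(session, url_idx, url):
        async with session.get(url) as resp:
            body = await resp.text()
            # ...scraping logic on `body` goes here...
            await save_file(body, f"page_{url_idx}.html")  # placeholder file name


    async def main():
        urls = ["http://www.google.com", "http://www.google.es"]
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(download_html(session, url_idx, url))
                for url_idx, url in enumerate(urls)
            ]
            await asyncio.gather(*tasks)


    if __name__ == "__main__":
        asyncio.run(main())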