Tags: python, asynchronous, web-scraping, python-asyncio, tqdm

Making asynchronous HTTP requests using Python's asyncio


I am trying to do some scraping: I have a list of URLs, I fetch the HTML response for each one, and then continue with the scraping. Naturally, I have attempted to make the requests to the URLs asynchronously, but I have failed.

Here is what I have so far:

import aiohttp
import asyncio


async def save_file(row, file_path):
    with open(file_path, 'w') as f:
        f.write(row)


async def download_html(url_idx, url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:

            # Requesting the page
            body = await resp.text()

            file_path = #path to output file

            #some scraping logic......

            await save_file(#result of the scraping, file_path)


async def main():
    urls = ['url1', 'url2', 'url3']
    tasks = []

    for url_idx, url in enumerate(urls):
        task = asyncio.create_task(download_html(url_idx, url))
        tasks.append(task)
    await asyncio.gather(*tasks)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

Running the above code gives me this error:

There is no current event loop: loop = asyncio.get_event_loop()

What would be the right approach?

I would also love to know how to use tqdm to display a progress bar.


Solution

  • Here is an example of how you can restructure your code a little and integrate a tqdm progress bar:

    import asyncio
    
    import aiohttp
    from bs4 import BeautifulSoup
    from tqdm import tqdm
    
    
    async def download_html(session, url):
        async with session.get(url) as resp:
            # Requesting the page
            body = await resp.text()
    
            soup = BeautifulSoup(body, "html.parser")
            return url, soup.title.text
    
    
    async def main():
        urls = [
            "http://www.google.com",
            "http://www.google.es",
            "http://www.google.de",
        ]
    
        # One ClientSession is shared by all requests
        async with aiohttp.ClientSession() as session:
            tasks = set()
            for url in urls:
                task = asyncio.create_task(download_html(session, url))
                tasks.add(task)

            results = []
            # as_completed yields tasks as they finish, so wrapping it in tqdm
            # advances the progress bar once per completed request
            for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
                url, title = await t
                results.append((url, title))
    
        print()
        print("Results:")
        print(*results, sep="\n")
    
        # save the results, etc.
        # ...
    
    
    if __name__ == "__main__":
        asyncio.run(main())
    

    Prints:

    100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24.20it/s]
    
    Results:
    ('http://www.google.com', 'Google')
    ('http://www.google.de', 'Google')
    ('http://www.google.es', 'Google')
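
    A note on the original error: in recent Python versions (3.10 and later), calling asyncio.get_event_loop() when no event loop is running emits exactly that "There is no current event loop" warning. asyncio.run(main()) creates and closes the loop for you, so it replaces the get_event_loop() / run_until_complete() pair. The other change above is that a single ClientSession is shared by all requests instead of opening a new session per URL.

    If you also want to write each page (or scraping result) to a file, as your original save_file did, keep in mind that a plain open()/write() call blocks the event loop. Below is a minimal sketch of one way around that, pushing the blocking write into a worker thread with asyncio.to_thread (Python 3.9+); the page_<idx>.html file names are only a placeholder:

    import asyncio

    import aiohttp


    def write_sync(text, file_path):
        # Ordinary blocking file write, executed off the event loop
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text)


    async def save_file(text, file_path):
        # Run the blocking write in a worker thread (Python 3.9+)
        await asyncio.to_thread(write_sync, text, file_path)


    async def download_html(session, url_idx, url):
        async with session.get(url) as resp:
            body = await resp.text()
            # ...scraping logic on `body` goes here...
            await save_file(body, f"page_{url_idx}.html")  # placeholder file name


    async def main():
        urls = ["http://www.google.com", "http://www.google.es"]
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(download_html(session, url_idx, url))
                for url_idx, url in enumerate(urls)
            ]
            await asyncio.gather(*tasks)


    if __name__ == "__main__":
        asyncio.run(main())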