I am scraping a list of URLs: for each URL I fetch the HTML response and then continue with the scraping logic. Naturally, I tried to make the requests to the URLs asynchronously, but I could not get it to work.
Here is what I have so far:
import aiohttp
import asyncio
async def save_file(row, file_path):
    """Write ``row`` to ``file_path``, replacing any existing content.

    The write itself is blocking disk I/O, so it is handed to a worker
    thread with ``asyncio.to_thread``; the event loop stays free to keep
    servicing the other downloads while the file is written.

    Args:
        row: Text to write to the file.
        file_path: Destination path; opened in text mode ``"w"``.
    """

    def _write():
        # Explicit UTF-8: scraped pages routinely contain non-ASCII
        # characters, and the platform default encoding may be unable to
        # encode them.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(row)

    await asyncio.to_thread(_write)
async def download_html(url_idx, url):
    """Fetch ``url``, scrape it, and pass the result to ``save_file``.

    A new ``aiohttp.ClientSession`` is opened on every call. ``url_idx`` is
    not used in the lines shown; presumably it is meant for building
    ``file_path`` -- confirm against the elided code.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Requesting the page
            body = await resp.text()
            # NOTE: the next three lines are placeholders left by the author
            # (output path, scraping logic, value to save); they are not
            # valid Python as written.
            file_path = #path to output file
            #some scraping logic......
            await save_file(#result of the scrapping, file_path)
async def main():
    """Start one ``download_html`` task per URL and wait for all of them."""
    urls = ['url1', 'url2', 'url3']
    # Tasks are scheduled as soon as they are created, so the downloads
    # overlap; gather() then waits until every one has finished.
    pending = [
        asyncio.create_task(download_html(position, address))
        for position, address in enumerate(urls)
    ]
    await asyncio.gather(*pending)
# Script entry point: obtain an event loop and drive main() to completion.
# NOTE: the get_event_loop() call below is the statement quoted in the
# "There is no current event loop" message reported for this script.
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Running the above code gives me this error:
There is no current event loop: loop = asyncio.get_event_loop()
What would be the right approach?
I would also love to know how to use tqdm to display a progress bar.
Here is an example of how you can restructure your code a little and integrate a tqdm progress bar:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm import tqdm
async def download_html(session, url):
    """Fetch ``url`` through ``session`` and return its page title.

    Args:
        session: Client session used for the request; ``main`` passes a
            shared ``aiohttp.ClientSession``.
        url: Address of the page to download.

    Returns:
        A ``(url, title)`` tuple. ``title`` is the text of the document's
        ``<title>`` element, or ``None`` when the page has no such element.
    """
    async with session.get(url) as resp:
        # Requesting the page
        body = await resp.text()
        soup = BeautifulSoup(body, "html.parser")
        # soup.title is None when the document has no <title> tag; guard so
        # that a title-less page yields None instead of raising
        # AttributeError.
        title_tag = soup.title
        return url, (title_tag.text if title_tag is not None else None)
async def main():
    """Download every URL concurrently, showing progress, then print results.

    One ``aiohttp.ClientSession`` is shared by all requests. Results are
    collected in completion order, so the printed order may differ from the
    order of ``urls``.
    """
    urls = [
        "http://www.google.com",
        "http://www.google.es",
        "http://www.google.de",
    ]

    async with aiohttp.ClientSession() as session:
        tasks = {
            asyncio.create_task(download_html(session, address))
            for address in urls
        }

        results = []
        # as_completed() yields awaitables as the downloads finish; wrapping
        # it in tqdm advances the bar once per finished page.
        progress = tqdm(asyncio.as_completed(tasks), total=len(tasks))
        for finished in progress:
            page_url, page_title = await finished
            results.append((page_url, page_title))

    print()
    print("Results:")
    print(*results, sep="\n")

    # save the results, etc.
    # ...
# Script entry point: run the main() coroutine to completion when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    asyncio.run(main())
Prints:
100%|███████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24.20it/s]
Results:
('http://www.google.com', 'Google')
('http://www.google.de', 'Google')
('http://www.google.es', 'Google')