Step 1 (sync): Determine exactly how many pages need to be scraped.
Step 2 (sync): Build the links to the pages to be scraped in a for loop.
Step 3 (async): Use the link list from step 2 to collect, from each of those pages, the links to the desired detail pages.
Step 4 (async): Use the result from step 3 to extract the detail information for each Hofladen (farm shop). The information for each shop is stored in a list, and each of these lists is appended to a global list (a stripped-down sketch of this hand-off follows right after these steps).
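To make the intended hand-off concrete, here is a stripped-down toy version of the step 3 → step 4 pattern (the coroutine names and the fake data are placeholders, not taken from the real script further down):

import asyncio


async def collect_links(page: int, out: list) -> None:
    # stand-in for step 3: pretend every overview page yields two detail links
    out.extend([f"detail-{page}-a", f"detail-{page}-b"])


async def scrape_detail(link: str) -> None:
    # stand-in for step 4: pretend to scrape one detail page
    print("scraping", link)


async def main() -> None:
    detail_links: list[str] = []
    # step 3: one task per overview page, all appending to the shared list
    await asyncio.gather(*(collect_links(page, detail_links) for page in range(3)))
    # step 4: one task per collected detail link
    await asyncio.gather(*(scrape_detail(link) for link in detail_links))


asyncio.run(main())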
The transition from step 3 to step 4 does not seem to work properly; the script fails with the following traceback:
Traceback (most recent call last):
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 108, in <module>
asyncio.run(main())
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 96, in main
await asyncio.gather(*tasks_detail_infos)
File "/Users/REPLACED_MY_USER/PycharmProjects/PKI-Projekt/test_ttt.py", line 61, in scrape_detail_infos
data = JsonLdExtractor().extract(body_d)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/jsonld.py", line 21, in extract
tree = parse_html(htmlstring, encoding=encoding)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/extruct/utils.py", line 10, in parse_html
return lxml.html.fromstring(html, parser=parser)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 873, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Users/REPLACED_MY_USER/miniconda3/envs/scrapy/lib/python3.10/site-packages/lxml/html/__init__.py", line 761, in document_fromstring
raise etree.ParserError(
lxml.etree.ParserError: Document is empty
Process finished with exit code 1
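The failing call is JsonLdExtractor().extract(body_d), so presumably at least one detail page comes back with an empty (or otherwise unparseable) body. A minimal standalone check along these lines (check_detail_page is just a hypothetical helper of mine, not part of the script) would confirm that for a single URL:

import asyncio

import aiohttp


async def check_detail_page(detail_link: str) -> None:
    # fetch one detail page and report status and body length before any parsing
    async with aiohttp.ClientSession() as session:
        async with session.get(detail_link, allow_redirects=True) as resp:
            body = await resp.text()
            print(f"{detail_link}: status={resp.status}, body length={len(body)}")


# usage with one of the scraped detail links, e.g.:
# asyncio.run(check_detail_page("https://hofladen.info/..."))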
In a first attempt I rewrote the async function append_detail_infos so that it no longer creates a list and appends the values but only prints data[0]["name"]. This produced exactly the same traceback as above.
In the next attempt, I exported the links in detail_links as a .csv file, inspected them visually, and opened some of them to check that they were valid. They were.
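For completeness, a programmatic spot check of the exported file could replace the visual inspection; this is only a sketch (the sample size of ten is arbitrary) that reads the detail_links.csv written by the script below:

import pandas as pd
import requests

# read the exported links back in and fetch a small sample synchronously
links = pd.read_csv("detail_links.csv", index_col=0).iloc[:, 0].tolist()
for link in links[:10]:
    resp = requests.get(link, allow_redirects=True)
    print(link, resp.status_code, len(resp.text))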
The full script:

import asyncio
import time

import aiohttp
import requests
import re
from selectolax.parser import HTMLParser
from extruct.jsonld import JsonLdExtractor
import pandas as pd

BASE_URL = "https://hofladen.info"
FIRST_PAGE = 1


def get_last_page(url: str) -> int:
    res = requests.get(url).text
    html = HTMLParser(res)
    last_page = int(re.findall(r"(\d+)", html.css("li.page-last > a")[0].attributes["href"])[0])
    return last_page


def build_links_to_pages(start: int, ende: int) -> list:
    lst = []
    for i in range(start, ende + 1):
        url = f"https://hofladen.info/regionale-produkte?page={i}"
        lst.append(url)
    return lst


async def scrape_detail_links(url: str):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, allow_redirects=True) as resp:
            body = await resp.text()
            html = HTMLParser(body)
            for node in html.css(".sp13"):
                detail_link = BASE_URL + node.attributes["href"]
                detail_links.append(detail_link)


async def append_detail_infos(data):
    my_detail_lst = []
    # print(data[0]["name"])  # name, for debugging purposes
    my_detail_lst.append(data[0]["name"])  # name
    my_detail_lst.append(data[0]["address"]["streetAddress"])  # street
    my_detail_lst.append(data[0]["address"]["postalCode"])  # postal code
    my_detail_lst.append(data[0]["address"]["addressLocality"])  # city
    my_detail_lst.append(data[0]["address"]["addressRegion"])  # federal state
    my_detail_lst.append(data[0]["address"]["addressCountry"])  # country
    my_detail_lst.append(data[0]["geo"]["latitude"])  # latitude
    my_detail_lst.append(data[0]["geo"]["longitude"])  # longitude
    detail_infos.append(my_detail_lst)


async def scrape_detail_infos(detail_link: str):
    async with aiohttp.ClientSession() as session_detailinfos:
        async with session_detailinfos.get(detail_link) as res_d:
            body_d = await res_d.text()
            data = JsonLdExtractor().extract(body_d)
            await append_detail_infos(data)


async def main() -> None:
    start_time = time.perf_counter()
    # Begin individual code
    # ----------
    global detail_links, detail_infos
    detail_links, detail_infos = [], []
    tasks = []
    tasks_detail_infos = []

    # extract the last page to iterate over
    last_page = get_last_page("https://hofladen.info/regionale-produkte")

    # scrape detail links
    links_to_pages = build_links_to_pages(FIRST_PAGE, last_page)
    for link in links_to_pages:
        task = asyncio.create_task(scrape_detail_links(link))
        tasks.append(task)
    print("Saving the output of extracted information.")
    await asyncio.gather(*tasks)
    pd.DataFrame(data=detail_links).to_csv("detail_links.csv")

    # scrape detail infos
    for detail_url in detail_links:
        task_detail_infos = asyncio.create_task(scrape_detail_infos(detail_url))
        tasks_detail_infos.append(task_detail_infos)
    await asyncio.gather(*tasks_detail_infos)

    # End individual code
    # ------------
    time_difference = time.perf_counter() - start_time
    print(f"Scraping time: {time_difference} seconds.")
    print(len(detail_links))
    # print(detail_infos)


asyncio.run(main())
I also tried the following changes:

added allow_redirects=True:

async with session_detailinfos.get(detail_link, allow_redirects=True) as res_d:

added return_exceptions=True:

await asyncio.gather(*tasks_detail_infos, return_exceptions=True)
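With return_exceptions=True the gather call no longer raises, so a small follow-up loop in main() along these lines (purely a debugging sketch of mine, not part of the script above) should reveal which detail link actually triggers the ParserError:

results = await asyncio.gather(*tasks_detail_infos, return_exceptions=True)
# gather preserves task order, which matches the order of detail_links here
for detail_url, result in zip(detail_links, results):
    if isinstance(result, Exception):
        print(f"{detail_url} failed with {type(result).__name__}: {result}")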