I am currently working on a web scraper, and each time I try to click a certain link button or get its href, I get absolutely nothing. However, I must point out that when I visit the website myself, the link I need to click works and the data is accessible, but when I use my web scraper it doesn't. Why?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import urllib.request
import os
# Page the scraper opens.
WEBSITE_URL = (
    'https://www.i-de.es/conexion-red-electrica/'
    'produccion-energia/mapa-capacidad-acceso'
)

# XPath locators for the elements the scraper interacts with, in the order
# they are used: the two buttons that are clicked first, then the PDF anchor.
BUTTON_COOKIE_XPATH = '//*[@id="onetrust-accept-btn-handler"]'
BUTTON_AVISO_XPATH = '//*[@id="MapaCapaciadaModalButton"]/span[1]'
BUTTON_PDF_XPATH = (
    '//*[@id="portlet_com_liferay_journal_content_web_portlet_'
    'JournalContentPortlet_INSTANCE_aVVDHaAKM4S6"]'
    '/div/div/div/div/div/p/a'
)

# Placeholder values: replace with real local paths before running.
DOWNLOAD_PATH = '/path/to/download/directory'
PROFILE_PATH = 'my personal path to my chrome profile'
def setup_driver(profile_path: str = None) -> webdriver.Chrome:
    """Build a Chrome WebDriver configured for unattended downloads.

    Args:
        profile_path: Optional Chrome user-data directory. When given (and
            non-empty) it is passed to Chrome as ``user-data-dir``.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    # Download preferences: save into DOWNLOAD_PATH without prompting.
    download_prefs = {
        "download.default_directory": DOWNLOAD_PATH,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    }

    opts = Options()
    if profile_path:
        opts.add_argument(f"user-data-dir={profile_path}")
    opts.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=opts)
def wait_and_click(driver: webdriver.Chrome, by: By, value: str):
    """Wait for an element to become clickable, then click it.

    Args:
        driver: The WebDriver to search in.
        by: Locator strategy (e.g. ``By.XPATH``).
        value: Locator expression for the chosen strategy.

    The wait is capped at 10 seconds; if the element is not clickable by
    then, the exception raised by ``WebDriverWait.until`` propagates.
    """
    locator = (by, value)
    clickable = EC.element_to_be_clickable(locator)
    WebDriverWait(driver, 10).until(clickable).click()
def get_pdf_url(driver: webdriver.Chrome) -> str:
    """Return the ``href`` of the PDF link on the currently loaded page.

    Waits up to 10 seconds for the element located by ``BUTTON_PDF_XPATH``
    to be present in the DOM (presence only; it need not be visible).

    Raises:
        ValueError: If the element has no ``href`` or the value is empty.
    """
    locator = (By.XPATH, BUTTON_PDF_XPATH)
    anchor = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(locator)
    )
    href = anchor.get_attribute('href')
    if href:
        return href
    raise ValueError("Failed to retrieve the PDF URL")
def download_pdf(url: str, download_path: str) -> str:
    """Download *url* into *download_path* and return the saved file's path.

    The file is always stored as ``downloaded_file.pdf`` inside
    *download_path*.

    Args:
        url: Address of the PDF to fetch.
        download_path: Directory to save into. It is created if it does not
            exist yet; an empty string means the current working directory.

    Returns:
        The path of the downloaded file.

    Raises:
        FileNotFoundError: If the file is missing after the download.
        urllib.error.URLError: If the URL cannot be fetched.
    """
    if download_path:
        # Without the directory, urlretrieve fails when opening the target.
        os.makedirs(download_path, exist_ok=True)
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    # urlretrieve blocks until the whole file has been written, so there is
    # nothing to wait for afterwards (no sleep needed).
    urllib.request.urlretrieve(url, local_pdf_path)
    if not os.path.isfile(local_pdf_path):
        raise FileNotFoundError("PDF file was not found after downloading")
    return local_pdf_path
def main():
    """Open the site, click through the two buttons, and download the PDF.

    The browser is always closed, even when a step fails.
    """
    driver = setup_driver()
    try:
        driver.get(WEBSITE_URL)
        # Fixed pause before touching the page.
        sleep(10)
        # Click the buttons in order: cookie button first, then the
        # "aviso" button.
        for xpath in (BUTTON_COOKIE_XPATH, BUTTON_AVISO_XPATH):
            wait_and_click(driver, By.XPATH, xpath)
        saved_to = download_pdf(get_pdf_url(driver), DOWNLOAD_PATH)
        print(f"PDF downloaded to: {saved_to}")
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
As you can see, it's not a really big scraper; I only want to get the one file located by 'BUTTON_PDF_XPATH'.
So I tried several things to fix it, such as using my Chrome profile with the web scraper, which sometimes resulted in the error ERR_HTTP2_PROTOCOL_ERROR, in infinite loading until it timed out, or in some cases in the website loading but nothing being clickable (all the XPaths work, I can assure you).
I also tried slowing down the scraper with some sleep() calls, but that just made me wait for nothing; I even tried clicking the link directly, but it just kept making me leave the page.
Finally, I wanted to try an argument such as options.add_argument('--disable-http2') to address the ERR_HTTP2_PROTOCOL_ERROR, but I don't know how to use it.
You can get the PDF link from the static HTML; there is no need for Selenium:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
def extract_pdf_link(url):
    """Return the absolute URL of the first PDF link in the page at *url*.

    The page is fetched with the module-level ``HEADERS`` and parsed as
    static HTML; the first ``<a>`` whose ``href`` contains ``.pdf/`` is used.
    A relative ``href`` is resolved against *url*.

    Args:
        url: Address of the page to scan.

    Returns:
        The absolute URL of the PDF.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond in time.
        ValueError: If the page contains no matching link.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=HEADERS, timeout=30)
    # Fail loudly instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    link = soup.select_one('a[href*=".pdf/"]')
    if link is None:
        # select_one returns None when nothing matches; report that clearly
        # rather than crashing with an AttributeError on None.
        raise ValueError(f"No PDF link found at {url}")
    return urljoin(url, link.get('href'))
def download_pdf(url, download_path):
    """Download *url* into *download_path* and return the saved file's path.

    The file is always stored as ``downloaded_file.pdf`` inside
    *download_path* (an empty string means the current working directory).
    The request is sent with the module-level ``HEADERS``.

    Args:
        url: Address of the PDF to fetch.
        download_path: Existing directory to save into.

    Returns:
        The path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond in time.
    """
    local_pdf_path = os.path.join(download_path, "downloaded_file.pdf")
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=HEADERS, timeout=30)
    # Check the status before writing, so an error page is never saved
    # under a .pdf name.
    response.raise_for_status()
    with open(local_pdf_path, 'wb') as f:
        f.write(response.content)
    return local_pdf_path
# Page whose static HTML contains the PDF link.
WEBSITE_URL = (
    'https://www.i-de.es/conexion-red-electrica/'
    'produccion-energia/mapa-capacidad-acceso'
)
# Empty string: the file is saved relative to the current working directory.
DOWNLOAD_PATH = ''
# Browser-like user agent sent with every request.
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/128.0.0.0 Safari/537.36'
    ),
}

# Locate the PDF link on the page, then download it.
pdf_url = extract_pdf_link(WEBSITE_URL)
downloaded_pdf_path = download_pdf(pdf_url, DOWNLOAD_PATH)