How do I control Selenium's PDF and Excel file download behavior?


I want to download all the tender documents from this URL: http://www.ha.org.hk/haho/ho/bssd/T18G014Pc.htm

I'm using Selenium to go through each tender link and download the files.

However, my scraper can't handle the Excel download behavior. It currently handles PDF files well.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve



driver = webdriver.Chrome(executable_path='chromedriver_win32/chromedriver.exe')
# open url in browser

driver.get('http://www.ha.org.hk/haho/ho/bssd/TN_236490_000952a.htm')

# get html file source
html = driver.page_source
soup = BeautifulSoup(html, "lxml")

# extract table
table_body = soup.find('tbody')

# extract all tender links
table_url = soup.find_all('a')
for url in table_url:
    if not url.has_attr('href'):  # skip <a> tags without an href
        continue
    print("Opening url:", url['href'])
    print("Subject matter:", url.getText().strip())
    driver.get(url['href'])
    # get html file source
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    # look for url links which may contain downloadable documents
    doc_urls = soup.find_all('a')

    if doc_urls[0].has_attr('href'):  # some <a> tags have no href, so we skip those
        driver.get(doc_urls[0]['href'])
        tender_document = driver.current_url
        print(doc_urls[0].getText().strip(), '.pdf', sep='')

    # loop through all urls
    for doc_url in doc_urls:
        if doc_url.has_attr('href'):  # some <a> tags have no href, so we skip those
            # open the doc url
            driver.get(doc_url['href'])
            # get the tender pdf file path
            tender_document = driver.current_url
            # download the file
            folder_location = 'C:\\Users\\user1\\Desktop\\tender_documents'
            filename = doc_url.getText().strip() + '.pdf'
            fullfilename = os.path.join(folder_location, filename)
            urlretrieve(tender_document, fullfilename)


Solution

  • Try the following to download all documents:

    import requests
    from bs4 import BeautifulSoup
    import re
    
    
    base_url = "http://www.ha.org.hk"
    tender = "T18G014Pc"
    
    with requests.Session() as session:
        r = session.get(f"{base_url}/haho/ho/bssd/{tender}.htm")
    
        # get all documents links
        docs = BeautifulSoup(r.text, "html.parser").select("a[href]")
        for doc in docs:
            href = doc.attrs["href"]
            name = doc.text
            print(f"name: {name}, href: {href}")
    
            # open document page
            r = session.get(href)
    
            # get file path (the tender page opens the document via window.open('...'))
            match = re.search(r"(?<=window\.open\(')(.*)(?=',)", r.text)
            if not match:
                continue  # no embedded document on this page, skip it
            file_path = match.group(0)
            file_name = file_path.split("/")[-1]
    
            # get file and save
            r = session.get(f"{base_url}/{file_path}")
            with open(file_name, 'wb') as f:
                f.write(r.content)
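
  • If you want to keep Selenium driving the browser instead, Chrome's download behavior can be steered through its preferences, so PDF and Excel files are saved straight to a folder rather than opened in the built-in viewer or behind a "Save As" dialog. A minimal sketch, assuming chromedriver is on your PATH and reusing the folder and tender URL from the question:

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {
        # save files into this folder instead of the default Downloads folder
        "download.default_directory": "C:\\Users\\user1\\Desktop\\tender_documents",
        # don't show a "Save As" dialog for each file
        "download.prompt_for_download": False,
        # download PDFs instead of rendering them in Chrome's built-in viewer
        "plugins.always_open_pdf_externally": True,
    })

    driver = webdriver.Chrome(options=options)
    driver.get("http://www.ha.org.hk/haho/ho/bssd/T18G014Pc.htm")

    With these preferences set, navigating to a document link makes Chrome download the file silently, so PDF and Excel downloads both land in the same folder.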