My goal is to extract all the href links from this page and find the .pdf links. I tried using the requests library and Selenium, but neither of them could extract it.
How can I solve this problem? Thank you.
Example: this page contains a link to a .pdf file.
This is the request code:
import requests
from bs4 import BeautifulSoup

# Fetch the page with a browser-like User-Agent (some sites block the default
# python-requests UA) and print every anchor's href.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'}
url = "https://www.bain.com/insights/topics/energy-and-natural-resources-report/"

response = requests.get(url, headers=headers)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')

# NOTE(review): the .pdf links on this page are loaded inside an iframe/JS
# payload, so they will not appear among these static anchors.
for link in soup.find_all('a'):
    href = link.get('href')
    if href is not None:  # skip <a> tags without an href (original printed "None")
        print(href)
This is the selenium code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Render the page in Chrome and print every anchor's href.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                          options=options)
try:
    # implicitly_wait must be configured BEFORE navigating; it does nothing
    # retroactively. Also, driver.get() returns None -- the rendered HTML must
    # be read from driver.page_source afterwards (the original passed None to
    # BeautifulSoup).
    driver.implicitly_wait(10)
    # Original URL used the singular "natural-resource-report" slug; the
    # correct slug is plural, matching the requests example above.
    driver.get("https://www.bain.com/insights/topics/energy-and-natural-resources-report/")
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, 'html.parser')
    for link in soup.find_all('a'):
        print(link.get('href'))
finally:
    driver.quit()  # always release the browser, even if parsing raises
Here is a python-requests version showing how to get all the PDF links from that page:
import json
import re
import requests
from bs4 import BeautifulSoup
def find_pdfs(data):
    """Recursively yield every "url" value containing ".pdf" from nested JSON data.

    Walks dicts and lists in any combination; scalar values other than a
    matching "url" string are ignored.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            # A "url" key holding a .pdf string is a hit; everything else is
            # recursed into. The isinstance guard fixes two defects in the
            # original: ".pdf" in v raised TypeError when "url" mapped to a
            # non-string (e.g. None), and containers stored under a "url" key
            # were never searched.
            if key == "url" and isinstance(value, str) and ".pdf" in value:
                yield value
            else:
                yield from find_pdfs(value)
    elif isinstance(data, list):
        for item in data:
            yield from find_pdfs(item)
# The PDF links are not in the page HTML itself: the page embeds an iframe
# whose JS contains a docVersion object pointing at a committed JSON payload,
# and the .pdf URLs live inside that JSON.
page_url = "https://www.bain.com/insights/topics/energy-and-natural-resources-report/"
page_soup = BeautifulSoup(requests.get(page_url).content, "html.parser")

# Step 1: follow the first <iframe> on the page.
embed_url = page_soup.iframe["src"]
embed_body = requests.get(embed_url).text

# Step 2: pull the docVersion JS object out of the iframe source.
doc_version = json.loads(re.search(r"docVersion: (.*}),", embed_body).group(1))

# Step 3: fetch the committed JSON (wrapped in a JS call) and unwrap it.
payload_text = requests.get(doc_version["committedJsonUrl"]).text
payload = json.loads(re.search(r"(\{.*\})\);", payload_text).group(1))

# print(json.dumps(payload, indent=4))

# Step 4: walk the JSON for every unique .pdf URL and print one per line.
unique_pdfs = set(find_pdfs(payload))
print(*unique_pdfs, sep="\n")
Prints:
https://www.bain.com/globalassets/noindex/2022/bain_report_global-private-equity-report-2022.pdf
https://www.bain.com/globalassets/noindex/2023/bain_report_engineering_and_r_and_d_report_2023.pdf
https://www.bain.com/globalassets/noindex/2023/bain_report_energy_and_natural_resources_2023.pdf