Python: Python 3.11.2
Python Editor: PyCharm 2022.3.3 (Community Edition) - Build PC-223.8836.43
OS: Windows 11 Pro, 22H2, 22621.1413
Browser: Chrome 111.0.5563.65 (Official Build) (64-bit)
Still a baby Pythoneer, I'm scraping the search results at https://dockets.justia.com/search?parties=Novo+Nordisk, but I also want to scrape the 10 docket pages it links to (e.g., https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963, https://dockets.justia.com/docket/california/cacdce/2:2023cv01929/878409, etc.).
How do I (1) "open" the 10 hyperlinked pages, (2) scrape the information in each subsidiary, hyperlinked docket page (e.g., the table inside the div with class "table-responsive with-gaps table-padding--small table-bordered table-padding-sides--small table-full-width"), and then (3) append the captured information to the per-index output files my script already writes for the parent URL?
I have looked into Selenium a bit, thinking I could open and control the pages that way, but it doesn't seem particularly applicable here. Do I really need Selenium for this, or is there some nifty and simple way to do it?
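For step (1), my rough idea is that plain requests should be able to fetch a docket page and see the table directly, with no browser automation. This is just a sketch to confirm that: the docket URL is one of the links from the search results above, and selecting the wrapper div by two of its CSS classes is my guess at how to match it (I haven't checked that every docket page uses the same markup).

import requests
from bs4 import BeautifulSoup

# One of the hyperlinked docket pages from the search results
docket_url = "https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963"

docket_soup = BeautifulSoup(requests.get(docket_url).text, "lxml")

# Look for the wrapper div around the docket table via two of its CSS classes
table_div = docket_soup.select_one("div.table-responsive.with-gaps")
print("table found" if table_div is not None else "table not found")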
This is what I have so far...
from bs4 import BeautifulSoup
import requests
html_text = requests.get("https://dockets.justia.com/search?parties=Novo+Nordisk").text
soup = BeautifulSoup(html_text, "lxml")
cases = soup.find_all("div", class_="has-padding-content-block-30 -zb")

# Printing to individual files
for index, case in enumerate(cases):
    case_number = case.find("span", class_="citation").text.replace(" ", "")
    case_url = case.find("a", {"class": "case-name"})["href"]
    with open(f"posts/{index}.txt", "w") as f:
        f.write(f"Case No.: {case_number.strip()} \t")
        f.write(f"Case URL: {case_url} \n")
    print(f"File saved: {index}")

# If printing in terminal
# for case in cases:
#     case_number = case.find("span", class_="citation").text.replace(" ", "")
#     case_url = case.find("a", {"class": "case-name"})["href"]
#     print(f"Case No.: {case_number.strip()}")  # strip removes leading/trailing whitespace
#     print(f"Case URL: {case_url}")
from aiohttp import ClientSession
from pyuseragents import random
from bs4 import BeautifulSoup
from asyncio import run


class DocketsJustia:
    def __init__(self):
        self.headers = {
            'authority': 'dockets.justia.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'ro-RO,ro;q=0.9,en-US;q=0.8,en;q=0.7',
            'cache-control': 'max-age=0',
            'referer': 'https://dockets.justia.com/search?parties=Novo+Nordisk',
            'user-agent': random(),
        }
        self.PatchFile = "nametxt.txt"

    async def Parser(self, session):
        count = 1
        while True:
            # the page number is passed via params instead of being baked into the URL
            params = {
                'parties': 'Novo Nordisk',
                'page': f'{count}',
            }
            async with session.get('https://dockets.justia.com/search', params=params) as response:
                links = BeautifulSoup(await response.text(), "lxml").find_all("div", {"class": "has-padding-content-block-30 -zb"})
                if not links:  # stop once a results page comes back empty, otherwise this loops forever
                    break
                for link in links:
                    try:
                        case_link = link.find("a", {"class": "case-name"}).get("href")
                        case_number = link.find("span", {"class": "citation"}).text
                        print(case_number + "\t" + case_link + "\n")
                        with open(self.PatchFile, "a", encoding='utf-8') as file:
                            file.write(case_number + "\t" + case_link + "\n")
                    except AttributeError:  # skip result blocks missing a link or citation
                        pass
            count += 1

    async def LoggerParser(self):
        async with ClientSession(headers=self.headers) as session:
            await self.Parser(session)


def StartDocketsJustia():
    run(DocketsJustia().LoggerParser())


if __name__ == '__main__':
    StartDocketsJustia()
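If the aiohttp route is the way to go, my rough idea for step (2) is a second coroutine like the one below, reusing the same session to download each docket page and flatten its table rows to tab-separated text. The selector is based on the wrapper div quoted earlier, and the URL in demo() is just one of the docket links used as a placeholder, so treat both as assumptions rather than verified markup.

from asyncio import run

from aiohttp import ClientSession
from bs4 import BeautifulSoup


async def fetch_docket_rows(session, case_link):
    # Step (1): fetch the hyperlinked docket page with the shared aiohttp session
    async with session.get(case_link) as response:
        soup = BeautifulSoup(await response.text(), "lxml")
    # Step (2): find the docket table's wrapper div and flatten each row to tab-separated text
    table_div = soup.select_one("div.table-responsive.with-gaps")
    if table_div is None:
        return []
    return [
        "\t".join(cell.get_text(strip=True) for cell in row.find_all(["th", "td"]))
        for row in table_div.find_all("tr")
    ]


async def demo():
    # Placeholder docket URL taken from the search results, just to show the call
    url = "https://dockets.justia.com/docket/puerto-rico/prdce/3:2023cv01127/175963"
    async with ClientSession() as session:
        for line in await fetch_docket_rows(session, url):
            print(line)


if __name__ == '__main__':
    run(demo())

I'm guessing Parser() could then await fetch_docket_rows(session, case_link) for each result and write those rows next to the case number in nametxt.txt, which would cover step (3).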