I want to go into the `department` entry of the page's data and select/print only the `name` and `url` of each item in it. I have tried the following, but I cannot work out how to reach `department` and pick out those two specific fields. How can I get the "name" and "url" for all the links?
import json
import urllib.request
from bs4 import BeautifulSoup
def getContent():
    """Fetch the target page and print the ``name`` and ``href`` of every link.

    Downloads the page at the hard-coded URL, parses it with BeautifulSoup's
    ``html.parser`` backend, and prints two lines for each ``<a>`` tag found.
    """
    # target site url -- urllib needs an explicit scheme; a bare
    # "www.xyz.com" makes Request() raise ValueError ("unknown url type").
    url = "http://www.xyz.com"
    # requesting the url for data
    request = urllib.request.Request(url)
    # get the html, whole page; the ``with`` block closes the connection
    # as soon as the body has been read
    with urllib.request.urlopen(request) as response:
        htmlpage = response.read()
    bsoup = BeautifulSoup(htmlpage, "html.parser")
    # Earlier attempts, kept for reference:
    # print(bsoup.prettify())
    # main_table = bsoup.find("div",attrs)
    # print(main_table)
    # print(bsoup.find_all('name'))
    # nav = bsoup.nav
    # print(bsoup.title.department.url)
    # for url in find_all('a'):
    # print(url.get('href'))
    for link in bsoup.find_all("a"):
        # Tag.get() returns None when the attribute is absent, so anchors
        # without a ``name`` attribute print "Title: None".
        print("Title: {}".format(link.get("name")))
        print("href: {}".format(link.get("href")))
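You can get the `name` and `url` using the `json` module. The page embeds this data as JSON inside a `<script type="application/ld+json">` tag, so locate that tag with BeautifulSoup, parse its contents with `json.loads`, and iterate over the `department` list as follows: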
import json
import urllib.request
from bs4 import BeautifulSoup
def get_content(url="http://www.ucdenver.edu/pages/ucdwelcomepage.aspx"):
    """Print the name and URL of every department listed on a page.

    The page is expected to embed its data as JSON inside a
    ``<script type="application/ld+json">`` tag whose top-level object has a
    ``"department"`` list of objects with ``"name"`` and ``"url"`` keys.

    Args:
        url: Address of the page to fetch. Defaults to the UC Denver
            welcome page.

    Raises:
        ValueError: If the page has no ``application/ld+json`` script tag,
            or the tag has no single text payload to parse.
    """
    request = urllib.request.Request(url)
    # The context manager closes the HTTP connection once the body is read.
    with urllib.request.urlopen(request) as response:
        html_page = response.read()
    soup = BeautifulSoup(html_page, 'html.parser')

    # find() returns None when nothing matches; check explicitly so the
    # failure is reported clearly instead of as an AttributeError on None.
    script = soup.find("script", type="application/ld+json")
    if script is None or not script.string:
        raise ValueError(
            "no application/ld+json script block found at {}".format(url)
        )

    json_data = json.loads(script.string)
    for data in json_data["department"]:
        # Left-align the name in a 60-character column so the URLs line up.
        print("{:<60} {}".format(data["name"], data["url"]))
# Run the scrape only when executed as a script, so that importing this
# module does not trigger a network request as a side effect.
if __name__ == "__main__":
    get_content()
Output:
Center for Undergraduate Exploration and Advising https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising
Commencement https://www.ucdenver.edu/commencement
Counseling Center https://www.ucdenver.edu/counseling-center
First Year Experiences https://www.ucdenver.edu/first-year-experiences
Health Programs https://www.ucdenver.edu/programs/health-programs
Housing and Dining https://www.ucdenver.edu/housing-and-dining
...