I'm trying to scrape the website address of Yauatcha Riyadh from a webpage using the requests module, but I end up getting None. I can fetch the title and the phone number from that page, but I fail to grab the website address.
import requests
from bs4 import BeautifulSoup
# Page to scrape: the TripAdvisor listing for Yauatcha Riyadh.
link = 'https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html'

# Browser-like headers applied to every request made through the session.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}

with requests.Session() as session:
    session.headers.update(headers)
    response = session.get(link)
    soup = BeautifulSoup(response.text, "lxml")

# Title and phone number are read straight from the detail-info container.
title_node = soup.select_one("[data-test-target='restaurant-detail-info'] h1")
title = title_node.get_text(strip=True)

phone_node = soup.select_one("[data-test-target='restaurant-detail-info'] a[href^='tel:']")
phone = phone_node.get("href")

# select_one() returns None when no anchor containing 'Website' is matched;
# in that case the website falls back to an empty string.
website_node = soup.select_one("[data-test-target='restaurant-detail-info'] span > a:-soup-contains('Website')")
if website_node is None:
    website = ""
else:
    website = website_node.get("href")

print((title, phone, website))
Output I'm getting:
('Yauatcha Riyadh', 'tel:+966 9200 06555', '')
Output I wish to get:
('Yauatcha Riyadh', 'tel:+966 9200 06555', 'http://yauatcha.sa/reservations/')
How can I scrape the website address from the given webpage using the requests module?
The URL is Base64-encoded inside a quoted JSON string that is embedded in the page. To decode it, you can use the following example:
import base64
import json
import re
from ast import literal_eval
from urllib.parse import unquote
import requests
from bs4 import BeautifulSoup
# Page to scrape: the TripAdvisor listing for Yauatcha Riyadh.
link = "https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html"

# Browser-like headers applied to every request made through the session.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}

with requests.Session() as s:
    s.headers.update(headers)
    # Without a timeout requests may wait indefinitely on a stalled connection.
    res = s.get(link, timeout=30)
    # Stop here with the HTTP status instead of failing later with an
    # unrelated IndexError/KeyError while parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "lxml")

# The last <script> tag is expected to carry, in its percent-encoded `src`
# attribute, a `JSON.parse("...")` call whose argument is a quoted JSON string.
script = soup.find_all("script")[-1]
payload = unquote(script["src"])

match = re.search(r'JSON\.parse\((".*")\)', payload)
if match is None:
    raise ValueError("JSON.parse(...) payload not found in the last <script> tag.")

# literal_eval() unquotes the string literal; json.loads() then parses its content.
data = json.loads(literal_eval(match.group(1)))

# Each result stores its own JSON document as a string under "data"; pick the
# one that contains the restaurant presentation entries.
for result in data["urqlSsrData"]["results"].values():
    d = json.loads(result["data"])
    if "RestaurantPresentation_getRestaurantReviewSnippetGroups" in d:
        break
else:
    raise ValueError("Not found.")

restaurant = d["RestaurantPresentation_searchRestaurantsById"]["restaurants"][0]

# To inspect everything available about the restaurant, dump it with:
#   json.dumps(restaurant, indent=4)

# The "url" field is Base64; once decoded, the address is what remains after
# dropping the text before the first "_" and the text after the last "_".
url = base64.b64decode(restaurant["url"]).decode("utf-8")
print("URL:", url.split("_", maxsplit=1)[-1].rsplit("_", maxsplit=1)[0])
Prints:
URL: http://yauatcha.sa/reservations/