python python-3.x web-scraping beautifulsoup python-requests

Trouble scraping a website address from a webpage using the requests module

I'm trying to scrape the website address of Yauatcha Riyadh from a webpage using the requests module, but the selector for it returns None, so I end up with an empty value. I can fetch the title and the phone number from that page, but I fail to grab the website address.

import requests
from bs4 import BeautifulSoup

# Restaurant page whose title, phone number and website link are scraped.
link = 'https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html'

# Browser-like headers applied to every request made through the session.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}

# Every selector below is scoped to this container.
info_block = "[data-test-target='restaurant-detail-info']"

with requests.Session() as session:
    session.headers.update(headers)
    response = session.get(link)
    soup = BeautifulSoup(response.text, "lxml")

    title_tag = soup.select_one(f"{info_block} h1")
    title = title_tag.get_text(strip=True)

    phone_link = soup.select_one(f"{info_block} a[href^='tel:']")
    phone = phone_link.get("href")

    try:
        website_link = soup.select_one(f"{info_block} span > a:-soup-contains('Website')")
        website = website_link.get("href")
    except AttributeError:
        # No element matched the selector, so there is nothing to call
        # .get() on; fall back to an empty string.
        website = ""

    print((title, phone, website))

Output I'm getting:

('Yauatcha Riyadh', 'tel:+966 9200 06555', '')

Output I wish to get:

('Yauatcha Riyadh', 'tel:+966 9200 06555', 'http://yauatcha.sa/reservations/')

How can I scrape the website address from the given webpage using the requests module?

Solution

The URL is base64-encoded inside a quoted JSON string that is embedded within the page. To decode it, you can use the following example:

import base64
import json
import re
from ast import literal_eval
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

# Restaurant page whose embedded page state holds the encoded website URL.
link = "https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html"

# Browser-like headers applied to every request made through the session.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}
with requests.Session() as s:
    s.headers.update(headers)
    # Without a timeout the request can block indefinitely; also fail loudly
    # on an HTTP error instead of trying to parse an error page.
    res = s.get(link, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "lxml")

    # The payload is read from the percent-encoded `src` attribute of the
    # last <script> tag on the page.
    last_script = soup.find_all("script")[-1]
    script_src = unquote(last_script["src"])

    # Inside it, the state is passed to JSON.parse() as a quoted string
    # literal: literal_eval() unquotes the literal, json.loads() parses it.
    match = re.search(r'JSON\.parse\((".*")\)', script_src)
    if match is None:
        raise ValueError("Embedded JSON.parse(...) payload not found.")
    page_state = json.loads(literal_eval(match.group(1)))

    # Each cached result stores its own JSON document as a string under "data".
    # NOTE(review): the loop matches on one key while the lookup below reads a
    # different one; presumably both live in the same result - confirm.
    for result in page_state["urqlSsrData"]["results"].values():
        result_data = json.loads(result["data"])
        if "RestaurantPresentation_getRestaurantReviewSnippetGroups" in result_data:
            break
    else:
        raise ValueError("Not found.")

    restaurant = result_data["RestaurantPresentation_searchRestaurantsById"]["restaurants"][0]

    # print all info about the restaurant:
    # print(json.dumps(restaurant, indent=4))

    # decode the URL: the base64 text wraps the real address between a prefix
    # and a suffix, each separated from it by an underscore.
    url = base64.b64decode(restaurant["url"]).decode("utf-8")
    print("URL:", url.split("_", maxsplit=1)[-1].rsplit("_", maxsplit=1)[0])

Prints:

URL: http://yauatcha.sa/reservations/