Search code examples
python, python-3.x, web-scraping, beautifulsoup, python-requests

Trouble scraping a website address from a webpage using the requests module


I'm trying to scrape the website address of Yauatcha Riyadh from a webpage using the requests module, but I end up with an empty result (the selector matches nothing, so the lookup returns None). I can fetch the title and the phone number from that page, but I fail to grab the website address.

import requests
from bs4 import BeautifulSoup

# Page to scrape: the TripAdvisor review page for Yauatcha Riyadh.
link = "https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html"

# Request headers imitating a desktop Chrome browser.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}

with requests.Session() as session:
    session.headers.update(headers)
    page = session.get(link)
    soup = BeautifulSoup(page.text, "lxml")

    # Title and phone number are looked up inside the restaurant detail block.
    heading = soup.select_one("[data-test-target='restaurant-detail-info'] h1")
    title = heading.get_text(strip=True)

    phone_anchor = soup.select_one(
        "[data-test-target='restaurant-detail-info'] a[href^='tel:']"
    )
    phone = phone_anchor.get("href")

    # The "Website" anchor may be missing from the fetched HTML; fall back to
    # an empty string in that case instead of failing.
    website_anchor = soup.select_one(
        "[data-test-target='restaurant-detail-info'] span > a:-soup-contains('Website')"
    )
    website = "" if website_anchor is None else website_anchor.get("href")

    print((title, phone, website))

Output I'm getting:

('Yauatcha Riyadh', 'tel:+966 9200 06555', '')

Output I wish to get:

('Yauatcha Riyadh', 'tel:+966 9200 06555', 'http://yauatcha.sa/reservations/')

How can I scrape the website address from the given webpage using the requests module?


Solution

  • The URL is Base64-encoded inside a quoted JSON string that is embedded in the page. To decode it, you can use the following example:

    import base64
    import json
    import re
    from ast import literal_eval
    from urllib.parse import unquote
    
    import requests
    from bs4 import BeautifulSoup
    
    # Page to scrape: the TripAdvisor review page for Yauatcha Riyadh.
    link = "https://www.tripadvisor.com/Restaurant_Review-g293995-d19398253-Reviews-Yauatcha_Riyadh-Riyadh_Riyadh_Province.html"

    # Request headers imitating a desktop Chrome browser.
    # NOTE(review): requests can only decode "br"/"zstd" response bodies when
    # the optional brotli/zstandard packages are installed - confirm, or drop
    # those two values from Accept-Encoding.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    with requests.Session() as s:
        s.headers.update(headers)
        # A timeout keeps the script from hanging forever on a stalled connection.
        res = s.get(link, timeout=30)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")

        # The data is read from the URL-encoded "src" attribute of the last
        # <script> tag on the page.
        scripts = soup.find_all("script")
        src = scripts[-1].get("src") if scripts else None
        if not src:
            raise ValueError("Embedded data script not found.")
        data = unquote(src)

        # The payload is a quoted string literal passed to JSON.parse(...):
        # literal_eval unquotes it, json.loads parses the resulting JSON text.
        match = re.search(r'JSON\.parse\((".*")\)', data)
        if match is None:
            raise ValueError("JSON.parse(...) payload not found.")
        data = json.loads(literal_eval(match.group(1)))

        # Each result carries its own JSON document in "data"; stop at the
        # first one that contains the restaurant presentation data.
        for result in data["urqlSsrData"]["results"].values():
            d = json.loads(result["data"])
            if "RestaurantPresentation_getRestaurantReviewSnippetGroups" in d:
                break
        else:
            raise ValueError("Not found.")

        d = d["RestaurantPresentation_searchRestaurantsById"]["restaurants"][0]

        # print all info about the restaurant:
        # print(json.dumps(d, indent=4))

        # decode the URL:
        url = base64.b64decode(d["url"]).decode("utf-8")
        # Drop everything up to the first "_" and from the last "_" onwards;
        # what remains is the website address.
        print("URL:", url.split("_", maxsplit=1)[-1].rsplit("_", maxsplit=1)[0])
    

    Prints:

    URL: http://yauatcha.sa/reservations/