Tags: python, selenium-webdriver, web-scraping, playwright

Website section not visible when using Selenium instead of Playwright


Problem

I have a scraper that collects product information. Two of the values I would like to scrape are the CO₂ emission and the compensation price of the product. Since this information only becomes visible after clicking a button in a specific section, I have to use a browser automation tool.

I have created the scraper with Playwright, and it is able to scrape the information in the sustainability section. I would now like to refactor the Playwright part to Selenium.

However, when I do this, the product pages loaded by the Selenium driver somehow do not include the sustainability section, which makes it impossible to scrape the information (compare the normal product page with the same page accessed through the Selenium driver).

Why does this happen, and how can I scrape the sustainability section using Selenium?

Code

Scraper with Playwright part

import requests
import time
import random
import pandas as pd
from typing import List
from bs4 import BeautifulSoup
from playwright.sync_api import Playwright, sync_playwright, TimeoutError as PlaywrightTimeoutError
from product_scraper.port.sources import Scraper
from product_scraper.domain import ProductItem
from dataclasses import asdict


class DayDealScraper(Scraper):

    def __init__(self, url):
        self.url = url
        self.urls = self._get_product_links(self.url)

    def get_product_info_df(self):
        """
        Return pd.DataFrame with product information from deals of the day.
        """
        product_info_df = self._get_product_info()
        print(product_info_df)
        return product_info_df

    def _get_product_links(self, url: str) -> List[str]:
        """
        Get href of products on url-page
        """
        urls = []

        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'lxml')

        articles = soup.find_all('article')

        for article in articles:

            try:
                href = article.find('a', class_='sc-qlvix8-0 dgECEw')['href']
                urls.append(f"https://www.digitec.ch{href}")
            except TypeError:
                continue

        return urls

    def _get_product_info(self):
        """
        Scrape product info of every subpage
        """
        urls = self._get_product_links(self.url)

        products = []
        for url in urls:
            print(url)

            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'lxml')

            name = soup.find('h1', class_='sc-12r9jwk-0 hcjJEJ').text
            price = float(soup.find('div', class_='sc-18ppxou-1 gwNBaL').text.split('.')[0])
            # Narrow down navigation section to get category
            navigation = soup.find('ol', class_='sc-4cfuhz-2 ipoVcw')
            navigation_parts = navigation.find_all('li', class_='sc-4cfuhz-3 iIgemP')
            category = [subcategory.text for subcategory in navigation_parts][-2]

            time.sleep(random.randint(4, 6))
            # Use Playwright to scrape emission information
            try:
                with sync_playwright() as pw:
                    agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                            'Chrome/83.0.4103.116 Safari/537.36'
                    browser = pw.chromium.launch(headless=True)
                    context = browser.new_context(user_agent=agent)
                    page = context.new_page()
                    page.goto(url)

                    # Find weight under product Specifications > Show more
                    page.locator("[data-test=\"showMoreButton-specifications\"]").click()
                    weight = page.text_content("td:text(\"Weight\") + td").split("\xa0")
                    weight = " ".join(weight)

                    # Find sustainability section and open it
                    page.locator("[data-test=\"sustainability\"]").click()
                    compensation_price = page.get_by_role("row", name="Compensation amount").text_content()
                    compensation_price = compensation_price.split("CHF ")[1].replace("’", "")
                    compensation_price = float(compensation_price)
                    emission = page.get_by_role("row", name="CO₂-Emission").text_content()
                    emission = emission.split("Emission")[1].split("kg")[0].replace("’", "")
                    emission = float(emission)

                    context.close()
                    browser.close()

            except PlaywrightTimeoutError:
                print(f"{url} has no sustainability section")
                continue

            product = ProductItem(name=name,
                                  price=price,
                                  category=category,
                                  weight=weight,
                                  emission=emission,
                                  compensation_price=compensation_price)
            products.append(asdict(product))

            print(asdict(product))

        products_df = pd.DataFrame(products)

        return products_df


if __name__ == '__main__':
    url = 'https://www.digitec.ch/en/daily-deal'
    day_deals = DayDealScraper(url)
    day_deals.get_product_info_df()

Scraper with Selenium part

import requests
import time
import random
import pandas as pd
from typing import List
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from product_scraper.port.sources import Scraper
from product_scraper.domain import ProductItem
from dataclasses import asdict


class DayDealScraper(Scraper):

    def __init__(self, url):
        self.url = url
        self.urls = self._get_product_links(self.url)

    def get_product_info_df(self):
        """
        Return pd.DataFrame with product information from deals of the day.
        """
        product_info_df = self._get_product_info()
        print(product_info_df)
        return product_info_df

    def _get_product_links(self, url: str) -> List[str]:
        """
        Get href of products on url-page
        """
        urls = []

        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'lxml')

        articles = soup.find_all('article')

        for article in articles:

            try:
                href = article.find('a', class_='sc-qlvix8-0 dgECEw')['href']
                urls.append(f"https://www.digitec.ch{href}")
            except TypeError:
                continue

        return urls

    def _get_product_info(self):
        """
        Scrape product info of every subpage
        """
        urls = self._get_product_links(self.url)

        products = []
        for url in urls:
            print(url)

            r = requests.get(url)
            soup = BeautifulSoup(r.content, 'lxml')

            name = soup.find('h1', class_='sc-12r9jwk-0 hcjJEJ').text
            price = float(soup.find('div', class_='sc-18ppxou-1 gwNBaL').text.split('.')[0])
            # Narrow down navigation section to get category
            navigation = soup.find('ol', class_='sc-4cfuhz-2 ipoVcw')
            navigation_parts = navigation.find_all('li', class_='sc-4cfuhz-3 iIgemP')
            category = [subcategory.text for subcategory in navigation_parts][-2]

            # Use Selenium to scrape emission information
            options = Options()

            # Set user agent
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                         'Chrome/83.0.4103.116 Safari/537.36'
            options.add_argument(f'user-agent={user_agent}')
            #options.add_argument('-headless')

            # Launch the browser
            driver = webdriver.Firefox(options=options)

            # Navigate to the URL
            driver.get(url)
            time.sleep(random.randint(4, 6))

            try:
                # Find weight under product Specifications > Show more
                show_more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test="showMoreButton-specifications"]')))
                show_more_button.click()
                weight = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
                    (By.XPATH, '//td[text()="Weight"]/following-sibling::td'))).text.strip()

                # Find sustainability section and open it
                sustainability_section = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test="sustainability"]')))
                sustainability_section.click()

                compensation_price = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
                    (By.XPATH, '//tr[./th[text()="Compensation amount"]]/td'))).text.strip()
                compensation_price = compensation_price.split("CHF ")[1].replace("’", "")
                compensation_price = float(compensation_price)

                emission = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//tr[./th[text()="CO₂-Emission"]]/td'))).text.strip()
                emission = emission.split("Emission")[1].split("kg")[0].replace("’", "")
                emission = float(emission)

            except TimeoutException:
                print(f"{url} has no sustainability section")
                continue

            finally:
                driver.quit()  # quit() ends the whole session so the driver process does not linger between iterations

            product = ProductItem(name=name,
                                  price=price,
                                  category=category,
                                  weight=weight,
                                  emission=emission,
                                  compensation_price=compensation_price)
            products.append(asdict(product))

            print(asdict(product))

        products_df = pd.DataFrame(products)

        return products_df


if __name__ == '__main__':
    url = 'https://www.digitec.ch/en/daily-deal'
    day_deals = DayDealScraper(url)
    day_deals.get_product_info_df()

ProductItem code (dataclass) - product_scraper.domain

from dataclasses import dataclass

@dataclass
class ProductItem():
    """
    Product in daily deals.
    """
    name: str
    price: float
    category: str
    weight: str
    emission: float
    compensation_price: float

Scraper abstract class - product_scraper.port.sources

from abc import ABC, abstractmethod


class Scraper(ABC):

    @abstractmethod
    def _get_product_links(self, url):
        pass

Solution

  • I was able to resolve the problem by using selenium-stealth, which helps to prevent Selenium detection. A sketch of how this fits into the driver setup follows the snippet below.

    Installation: pip install selenium-stealth

    In script:

    from selenium_stealth import stealth
    
    ...
    
    # Stealth selenium - add before driver.get(url)
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    
    ...
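
    A minimal sketch of how the stealth call can be wired into the Selenium part of the scraper. selenium-stealth applies its patches through Chrome DevTools Protocol commands, so this sketch assumes a Chromium driver (webdriver.Chrome) instead of the Firefox driver used in the question; the headless flag and the navigator.webdriver check are illustrative.

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium_stealth import stealth

    options = Options()
    # options.add_argument('--headless=new')  # optional; headless may still be detectable on some sites

    # Launch a Chromium-based driver; selenium-stealth patches it via CDP commands
    driver = webdriver.Chrome(options=options)

    # Apply the stealth patches before loading any page
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    # Quick check: an unpatched driver typically reports navigator.webdriver as True
    print(driver.execute_script("return navigator.webdriver"))

    url = "https://www.digitec.ch/en/daily-deal"  # placeholder; in the scraper this is the per-product URL
    driver.get(url)

    The rest of the Selenium logic (waiting for the [data-test="sustainability"] element, clicking it, and reading the compensation amount and CO₂ emission rows) stays the same as in the question.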