python, selenium-webdriver, web-scraping

Why doesn't pagination work in this case using Selenium?


Most websites display data across multiple pages. This is done to improve user experience and reduce loading times. But when I wanted to automate the data extraction process using Selenium, I noticed that my script only retrieves information from page one and then stops. What am I doing wrong?

from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd

url = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"

# Handle elements that may not be currently visible
def etext(e):
    """Extracts text from an element, handling visibility issues."""
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# Initialize result list to store data
result = []

with Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    
    while True:
        # Wait for the main content to load
        sel = By.CSS_SELECTOR, "div[data-testid=result-item]"
        houses = wait.until(EC.presence_of_all_elements_located(sel))
        
        # Extract and store data from the current page
        for house in houses:
            try:
                item = {
                    "address": etext(house.find_element(By.CSS_SELECTOR, "h2")),
                    "DateLast_sold": etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
                }
                result.append(item)
            except Exception as e:
                print(f"Error extracting address or date: {e}")
        
        # Check for "Next" button and move to the next page
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, '#main-content div._12n2exy2 nav div._14xj7k72')
            next_button.click()
            wait.until(EC.staleness_of(houses[0]))  # Wait for the new page to load
        except Exception as e:
            print("No more pages to scrape or error:", e)
            break  # Stop if no more pages

# Convert results to a DataFrame and display
df = pd.DataFrame(result)
print(df)

Solution

  • Different websites often require bespoke strategies to scrape them with any success.

    This site is protected by Cloudflare. When Cloudflare detects too many automated requests, it intervenes and presents a page that requires you to prove you're not a robot. The number of pages you can scrape before this happens varies, but it seems to be anywhere between 20 and 30, which is unfortunate because roughly 40 pages are available.

    The code below handles the cookie prompt (if it appears) and then tries to get as many addresses as possible. You should be able to adapt it to your specific needs.

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from undetected_chromedriver import Chrome
    from selenium.webdriver.remote.webelement import WebElement
    from selenium.webdriver.remote.webdriver import WebDriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.action_chains import ActionChains
    from typing import cast
    from collections.abc import Iterator
    
    URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
    TIMEOUT = 5
    
    # get text from webelement that may not be visible
    def etext(e: WebElement) -> str:
        if e:
            if t := e.text.strip():
                return t
            if (p := e.get_property("textContent")) and isinstance(p, str):
                return p.strip()
        return ""
    
    # click the WebElement
    def click(driver: WebDriver, e: WebElement) -> None:
        ActionChains(driver).click(e).perform()
    
    # get all WebElements that match the given css
    def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
        wait = WebDriverWait(driver, TIMEOUT)
        ec = EC.presence_of_all_elements_located
        sel = By.CSS_SELECTOR, css
        try:
            yield from wait.until(ec(sel))
        except TimeoutException:
            pass
    
    # look for the Next button and click it
    def click_next(driver: WebDriver) -> None:
        for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
            if etext(a) == "Next":
                click(driver, a)
                break
    
    # look for the shadow root
    def get_shadow_root(driver: WebDriver) -> WebDriver:
        wait = WebDriverWait(driver, TIMEOUT)
        ec = EC.presence_of_element_located
        sel = By.ID, "usercentrics-root"
        sre = wait.until(ec(sel))
        return cast(WebDriver, sre.shadow_root)
    
    # you may be required to accept or decline cookies
    # ignore any exceptions that may arise
    def click_through(driver: WebDriver) -> None:
        try:
            wait = WebDriverWait(get_shadow_root(driver), TIMEOUT)
            ec = EC.element_to_be_clickable
            sel = By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]"
            button = wait.until(ec(sel))
            click(driver, button)
        except Exception:
            pass
    
    if __name__ == "__main__":
        with Chrome() as driver:
            driver.get(URL)
            click_through(driver)
            prev_url = ""
            npages = 0
            # if and when Cloudflare intervenes, the current URL does not change
            while prev_url != driver.current_url:
                prev_url = driver.current_url
                for h2 in get_all(driver, "div[data-testid=result-item] h2"):
                    print(etext(h2))
                click_next(driver)
                npages += 1
            print(f"Processed {npages=}")