Most websites display data across multiple pages to improve the user experience and reduce loading times. But when I tried to automate the data extraction with Selenium, I noticed that my script only retrieves information from page one and then stops. What am I doing wrong?
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import undetected_chromedriver as uc

url = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"

# Handle elements that may not be currently visible
def etext(e):
    """Extracts text from an element, handling visibility issues."""
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

driver = uc.Chrome()

# Initialize result list to store data
result = []

with Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    while True:
        # Wait for the main content to load
        sel = By.CSS_SELECTOR, "div[data-testid=result-item]"
        houses = wait.until(EC.presence_of_all_elements_located(sel))
        # Extract and store data from the current page
        for house in houses:
            try:
                item = {
                    "address": etext(house.find_element(By.CSS_SELECTOR, "h2")),
                    "DateLast_sold": etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
                }
                result.append(item)
            except Exception as e:
                print(f"Error extracting address or date: {e}")
        # Check for "Next" button and move to the next page
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, '#main-content div._12n2exy2 nav div._14xj7k72')
            next_button.click()
            wait.until(EC.staleness_of(houses[0]))  # Wait for the new page to load
        except Exception as e:
            print("No more pages to scrape or error:", e)
            break  # Stop if no more pages

# Convert results to a DataFrame and display
df = pd.DataFrame(result)
print(df)
Different websites often require bespoke strategies to scrape them with any level of success.
This site is protected by Cloudflare. When Cloudflare detects too many automated requests it intervenes and presents a page that requires you to prove that you're not a robot. The number of pages you can scrape before this happens varies, but it seems to be somewhere between 20 and 30, which is unfortunate because there are ~40 pages available.
The code below will handle the cookie prompt (if it appears) and then try to get as many addresses as possible. You should be able to adapt this to your specific needs.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from typing import cast
from collections.abc import Iterator

URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# get text from webelement that may not be visible
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# click the WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# get all WebElements that match the given css
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    ec = EC.presence_of_all_elements_located
    sel = By.CSS_SELECTOR, css
    try:
        yield from wait.until(ec(sel))
    except TimeoutException:
        pass

# look for the Next button and click it
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# look for the shadow root
def get_shadow_root(driver: WebDriver) -> WebDriver:
    wait = WebDriverWait(driver, TIMEOUT)
    ec = EC.presence_of_element_located
    sel = By.ID, "usercentrics-root"
    sre = wait.until(ec(sel))
    return cast(WebDriver, sre.shadow_root)

# you may be required to accept or decline cookies
# ignore any exceptions that may arise
def click_through(driver: WebDriver) -> None:
    try:
        wait = WebDriverWait(get_shadow_root(driver), TIMEOUT)
        ec = EC.element_to_be_clickable
        sel = By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]"
        button = wait.until(ec(sel))
        click(driver, button)
    except Exception:
        pass

if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)
        prev_url = ""
        npages = 0
        # if and when Cloudflare intervenes, the current URL does not change
        while prev_url != driver.current_url:
            prev_url = driver.current_url
            for h2 in get_all(driver, "div[data-testid=result-item] h2"):
                print(etext(h2))
            click_next(driver)
            npages += 1
        print(f"Processed {npages=}")
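If you want the same DataFrame output as your original script rather than printed addresses, you can reuse the helpers above and collect one dict per result item inside the loop. This is just a minimal sketch of how the main block could be adapted; the "._1hzil3o9._1hzil3o8._194zg6t7" selector for the last-sold date is copied from your code and is likely to break whenever Zoopla regenerates its class names.

import pandas as pd

# reuses URL, etext, get_all, click_next and click_through from the script above
if __name__ == "__main__":
    result = []
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)
        prev_url = ""
        # stop when the URL no longer changes (last page reached or Cloudflare intervened)
        while prev_url != driver.current_url:
            prev_url = driver.current_url
            for house in get_all(driver, "div[data-testid=result-item]"):
                try:
                    result.append({
                        "address": etext(house.find_element(By.CSS_SELECTOR, "h2")),
                        "date_last_sold": etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7")),
                    })
                except Exception as e:
                    print(f"Error extracting address or date: {e}")
            click_next(driver)
    df = pd.DataFrame(result)
    print(df)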