javascript python html selenium-webdriver web-scraping

Why can't I extract listings information

I am trying to extract the EPC rating from each listings. you can only get the EPC rating when you click on the listing. each time i run my script it it's keeps timing out, what could be the issue ? though i tried increasing the waiting time for the main content to load but I still encounter the same issue. could it be the headless browser not been able to load?

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from typing import Iterator
import pandas as pd

# Constants
URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    sel = (By.CSS_SELECTOR, css)
    try:
        yield from wait.until(EC.presence_of_all_elements_located(sel))
    except TimeoutException:
        pass  # Return empty if elements are not found

# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# Handle cookie consent popup
def click_through(driver: WebDriver) -> None:
    try:
        wait = WebDriverWait(driver, TIMEOUT)
        shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
        button = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]")
        ))
        click(driver, button)
    except Exception:
        pass  # Ignore if cookie popup isn't present

# Scrape EPC Rating from individual listing
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open property details page
    try:
        epc_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div'))
        )
        return etext(epc_element)  # Extract EPC rating text
    except TimeoutException:
        return "N/A"  # Return "N/A" if EPC Rating is missing

# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
    result = []
    for house in get_all(driver, "div[data-testid=result-item]"):
        try:
            listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
            date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
            house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
            num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
            tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
            square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))

            # Get EPC Rating from listing page
            epc_rating = get_epc_rating(driver, listing_url)

            result.append({
                "Address": address,
                "Date Last Sold": date_sold,
                "Property Type": house_type,
                "Number of Rooms": num_rooms,
                "Tenure": tenure,
                "Square Foot": square_foot,
                "EPC Rating": epc_rating,
                "Listing URL": listing_url
            })
        except NoSuchElementException:
            continue  # Skip missing elements
    return result

# Main script execution
if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)  # Handle cookies

        all_results = []
        prev_url = ""
        npages = 0

        while prev_url != driver.current_url:  # Stop if pagination stops working (e.g., Cloudflare blocks)
            prev_url = driver.current_url
            all_results.extend(scrape_page(driver))
            click_next(driver)
            npages += 1

        # Convert results to DataFrame
        df = pd.DataFrame(all_results)

        # Display results
        print(df)
        print(f"Processed {npages} pages")

        # Save to CSV
        df.to_csv("zoopla_data.csv", index=False)

Solution

There's quite a bit of errors in your logic in the code here, so I'll offer you the solution to get the EPC rating, but you are going to need to go back and rework your logic of how you are navigating the site.

Your CSS Selector is off. Those classes also appear to be dynamically generated. Try avoiding hard coding these and think of robust ways to get the data when you encounter obviously randomized classes or ids.

So what you can do instead is just find the content in the html that starts with EPC rating. That will find the specific element then:

Code:

def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open property details page
    try:
        xpath_expression = (
            "//*[starts-with(translate(text(), 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 'EPC RATING')]"
        )
        
        epc_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath_expression))
        )
        return etext(epc_element)  # Extract EPC rating text
    except TimeoutException:
        return "N/A"  # Return "N/A" if EPC Rating is missing