Search code examples
pythonpandasseleniumbeautifulsoupexport-to-csv

Scraping with Beautiful Soup and Python to CSV


Trying to scrape floor sizes (in sq ft) and lot sizes (in hectares) from listings on a real estate website using Beautiful Soup and Selenium.

The floor sizes print fine in the console

image

but when writing to a csv file the the 'sq ft' info under the floor size column is not extracted

image

It seems if 'sq ft' is found by BS4 in the ID element after the the one stipulated, that is returned instead and all other 'sq ft' text is passed over on every other url when writing to the csv. As you can see on (image) two of the listings have this, despite those two links having hectares as well:

http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ

Can someone explain why the sq ft are printed on the console but not written to the csv? Any help would be appreciated.

Relevant HTML where CP2_CPContent_conDetails1_divDetails is relevant locator for floor sizes and lot sizes:

<div id="CP2_CPContent_conDetails1_divDetails">
                0.3 Acres <br>(0.12 Hectares)
                <div class="clear"></div>

                <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
                      Potential building size of 6,458 sq ft (600 sq m)<br>
                </div>

Code as follows:

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
        lot_size = lot_size.replace("(", "").replace(")", "")
        print(lot_size)
        return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
        print(floor_size)
        return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)


Solution

  • I have no problem to get Hectares with your code.

    I had problem with sq ft - it doesn't even display it. All because you used find() instead of find_all() in

     for element in soup.find()
    

    but find() doesn't return list with elements but single element and then for doesn't get this element from list but it gets probably its children and it searchs sq ft in wrong places.


    from selenium import webdriver
    import numpy as np
    import time
    import re
    from bs4 import BeautifulSoup
    import pandas as pd
    
    driver = webdriver.Chrome()
    shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
    driver.get(shw_search_url)
    
    
    #identify and extract listing links from each page
    def get_house_links(url, driver, pages=3):
        house_links = []
        driver.get(url)
        for i in range(pages):
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            listings = soup.find_all("a", class_="L")
            page_data = [row['href'] for row in listings]
            house_links.append(page_data)
            time.sleep(np.random.lognormal(0, 1))
            next_button = soup.select('img[src*="propNext"]')
            if next_button:
                next_button = next_button[0].find_parent('a')
                next_button_link = 'http://property.shw.co.uk' + next_button['href']
                driver.get(next_button_link)
        return house_links
    
    #get html data from url and return as object
    def get_html_data(url, driver):
        driver.get(url)
        time.sleep(np.random.lognormal(0,1))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return soup
    
    def get_lot_size(soup):
        try:
            for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
                lot_size = element.find_next(text=re.compile('Hectares'))
                if lot_size:
                    lot_size = lot_size.replace("(", "").replace(")", "")
                    lot_size = lot_size.strip()
                print('lot_size:', lot_size)
            return lot_size
        except Exception as ex:
            print("EX:", ex)
            return 'NA'
    
    def get_floor_size(soup):
        try:
            for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
                floor_size = element.find_next(text=re.compile('sq ft'))
                if floor_size:
                    floor_size = floor_size.strip()
                print('floor_size:', floor_size)
            return floor_size
        except Exception as ex:
            print("EX:", ex)
            return 'NA'
    
    def flatten_list(house_links):
        house_links_flat = []
        for sublist in house_links:
            for item in sublist:
                house_links_flat.append(item)
        return house_links_flat
    
    def get_house_data(driver, house_links_flat):
        house_data = []
        for link in house_links_flat:
            soup = get_html_data(link, driver)
            floor_size = get_floor_size(soup)
            lot_size = get_lot_size(soup)
            house_data.append([floor_size, lot_size])
            print('-------------------')
    
        return house_data
    
    house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
    house_links_flat = flatten_list(house_links_3pages)
    house_data_3pages = get_house_data(driver,house_links_flat)
    
    
    #open and write results to csv
    file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                               str(time.strftime("%H:%M%S")))
    columns = ["Floor_Size", "Lot_Size"]
    pd.DataFrame(house_data_3pages, columns = columns).to_csv(
        file_name, index = False, encoding = "UTF-8"
    )
    

    CSV:

    Floor_Size,Lot_Size
    ,0.21 Hectares
    7342 sq ft,
    1665 sq ft,
    "The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
    4672 to 20302 sq ft,
    ,0.36 Hectares
    ,0.08 Hectares
    ,0.18 Hectares
    2325 sq ft,
    ,0.02 Hectares
    5288 sq ft,
    0 sq ft,
    ,0.36 Hectares
    ,0.18 Hectares
    "*  Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
    1258 to 5385 sq ft,
    ,0.13 Hectares
    3600 sq ft,
    ,0.24 Hectares
    6781 to 6871 sq ft,