Trying to scrape floor sizes (in sq ft) and lot sizes (in hectares) from listings on a real estate website using Beautiful Soup and Selenium.
The floor sizes print fine in the console
but when writing to a csv file the the 'sq ft' info under the floor size column is not extracted
It seems if 'sq ft' is found by BS4 in the ID element after the the one stipulated, that is returned instead and all other 'sq ft' text is passed over on every other url when writing to the csv. As you can see on (image) two of the listings have this, despite those two links having hectares as well:
http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ
Can someone explain why the sq ft are printed on the console but not written to the csv? Any help would be appreciated.
Relevant HTML where CP2_CPContent_conDetails1_divDetails is relevant locator for floor sizes and lot sizes:
<div id="CP2_CPContent_conDetails1_divDetails">
0.3 Acres <br>(0.12 Hectares)
<div class="clear"></div>
<div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
Potential building size of 6,458 sq ft (600 sq m)<br>
</div>
Code as follows:
driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)
#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
house_links = []
driver.get(url)
for i in range(pages):
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all("a", class_="L")
page_data = [row['href'] for row in listings]
house_links.append(page_data)
time.sleep(np.random.lognormal(0, 1))
next_button = soup.select('img[src*="propNext"]')
if next_button:
next_button = next_button[0].find_parent('a')
next_button_link = 'http://property.shw.co.uk' + next_button['href']
driver.get(next_button_link)
return house_links
#get html data from url and return as object
def get_html_data(url, driver):
driver.get(url)
time.sleep(np.random.lognormal(0,1))
soup = BeautifulSoup(driver.page_source, 'html.parser')
return soup
def get_lot_size(soup):
try:
for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
lot_size = element.find_next(text=re.compile('Hectares'))
lot_size = lot_size.replace("(", "").replace(")", "")
print(lot_size)
return lot_size
except:
return 'NA'
def get_floor_size(soup):
try:
for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
floor_size = element.find_next(text=re.compile('sq ft'))
print(floor_size)
return floor_size
except:
return 'NA'
def flatten_list(house_links):
house_links_flat = []
for sublist in house_links:
for item in sublist:
house_links_flat.append(item)
return house_links_flat
def get_house_data(driver, house_links_flat):
house_data = []
for link in house_links_flat:
soup = get_html_data(link, driver)
floor_size = get_floor_size(soup)
lot_size = get_lot_size(soup)
house_data.append([floor_size, lot_size])
return house_data
house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)
#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
file_name, index = False, encoding = "UTF-8"
)
I have no problem to get Hectares
with your code.
I had problem with sq ft
- it doesn't even display it. All because you used find()
instead of find_all()
in
for element in soup.find()
but find()
doesn't return list with elements but single element and then for
doesn't get this element from list but it gets probably its children and it searchs sq ft
in wrong places.
from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)
#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
house_links = []
driver.get(url)
for i in range(pages):
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all("a", class_="L")
page_data = [row['href'] for row in listings]
house_links.append(page_data)
time.sleep(np.random.lognormal(0, 1))
next_button = soup.select('img[src*="propNext"]')
if next_button:
next_button = next_button[0].find_parent('a')
next_button_link = 'http://property.shw.co.uk' + next_button['href']
driver.get(next_button_link)
return house_links
#get html data from url and return as object
def get_html_data(url, driver):
driver.get(url)
time.sleep(np.random.lognormal(0,1))
soup = BeautifulSoup(driver.page_source, 'html.parser')
return soup
def get_lot_size(soup):
try:
for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
lot_size = element.find_next(text=re.compile('Hectares'))
if lot_size:
lot_size = lot_size.replace("(", "").replace(")", "")
lot_size = lot_size.strip()
print('lot_size:', lot_size)
return lot_size
except Exception as ex:
print("EX:", ex)
return 'NA'
def get_floor_size(soup):
try:
for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
floor_size = element.find_next(text=re.compile('sq ft'))
if floor_size:
floor_size = floor_size.strip()
print('floor_size:', floor_size)
return floor_size
except Exception as ex:
print("EX:", ex)
return 'NA'
def flatten_list(house_links):
house_links_flat = []
for sublist in house_links:
for item in sublist:
house_links_flat.append(item)
return house_links_flat
def get_house_data(driver, house_links_flat):
house_data = []
for link in house_links_flat:
soup = get_html_data(link, driver)
floor_size = get_floor_size(soup)
lot_size = get_lot_size(soup)
house_data.append([floor_size, lot_size])
print('-------------------')
return house_data
house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)
#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
file_name, index = False, encoding = "UTF-8"
)
CSV:
Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"* Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,