I am trying to use Nominatim to geolocate sets of addresses I have scraped from the web. Nominatim works fine for "standard" addresses eg. 123 StreetName St., ExampleSuburb but some of the addresses I have scraped have "non-standard" elements eg. Warehouse 3, 123 StreetName., ExampleSuburb.
Is there a way I can strip away "non-standard" elements to make it easier for Nominatim to find them? Or is there a way I can get Nominatim to try and geolocate addresses despite non-standard elements?
For example, the code below throws a type error while executing the code and I can't figure out how to fix the reformat address to stop this from happening as it is scraped directly off the website without me intervening at all.
from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
def scrapecafes(city, area):
#url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
response = requests.get(url, timeout=5)
soup_cafe_names = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_names)
cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
#cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]
#print(cafeNamesClean)
#addresses
soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_addresses)
cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
cafeAddressesClean = [address.text for address in cafeAddresses]
#cafeAddressesTuple = [(address,) for address in cafeAddressesClean]
#print(cafeAddressesClean)
##geocode addresses
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
try:
for item in cafeAddressesClean:
location = (locator.geocode(item))
lat = [location.latitude for item in location]
long = [location.longitude for item in location]
print(location)
except:
pass
#zip up for table
fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
print(fortable)
scrapecafes(melbourne, fitzroy)
There 2 problems in your script.
cafeAddressesClean
but you are not storing the output anywhere.zip
the lists, you are not converting them to list.The below inserts the values into sqlite database. Total there are 10 values that get inserted.
from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
#cafeNamesthornbury
def scrapecafes(city, area):
#url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
response = requests.get(url, timeout=5)
soup_cafe_names = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_names)
cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]
print(cafeNamesClean)
#addresses
soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_addresses)
cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
cafeAddressesClean = [address.text for address in cafeAddresses]
cafeAddressesTuple = [(address,) for address in cafeAddressesClean]
print(cafeAddressesClean)
##geocode addresses
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
location = []
for item in cafeAddressesClean:
location.append(locator.geocode(item))
lat = [loc.latitude for loc in location]
long = [loc.longitude for loc in location]
#zip up for table
fortable = list(zip(cafeNamesClean, cafeAddressesClean, lat, long))
##connect to database
try:
sqliteConnection = sqlite3.connect('25july_database.db')
cursor = sqliteConnection.cursor()
print("Database created and Successfully Connected to 25july_database")
sqlite_select_Query = "select sqlite_version();"
cursor.execute(sqlite_select_Query)
record = cursor.fetchall()
print("SQLite Database Version is: ", record)
cursor.close()
except sqlite3.Error as error:
print("Error while connecting to sqlite", error)
#create table
try:
sqlite_create_table_query = ''' CREATE TABLE IF NOT EXISTS scraper (
name TEXT NOT NULL,
address TEXT NOT NULL,
latitude FLOAT NOT NULL,
longitude FLOAT NOT NULL
);'''
cursor = sqliteConnection.cursor()
print("Successfully Connected to SQLite")
cursor.execute(sqlite_create_table_query)
sqliteConnection.commit()
print("SQLite table created")
except sqlite3.Error as error:
print("Error while creating a sqlite table", error)
##enter data into table
try:
for row in list(fortable):
sqlite_insert_name_param = """INSERT INTO scraper VALUES (?,?,?,?);"""
cursor.execute(sqlite_insert_name_param, row)
sqliteConnection.commit()
print("Total", cursor.rowcount, "Records inserted successfully into table")
cursor.close()
except sqlite3.Error as error:
print("Failed to insert data into sqlite table", error)
finally:
if (sqliteConnection):
sqliteConnection.close()
print("The SQLite connection is closed")
scrapecafes('melbourne', 'thornbury')
After running the script:
Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
Jerry Joy| 128 Mansfield Street, Thornbury|-37.7573008|145.0096578
The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
Little Henri| 848 High Street, Thornbury|51.6087678|-2.5260139
Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556