Search code examples
pythonweb-scrapinggeocodinggeopynominatim

How do I clean addresses so Nominatim can geolocate them properly?


I am trying to use Nominatim to geolocate sets of addresses I have scraped from the web. Nominatim works fine for "standard" addresses eg. 123 StreetName St., ExampleSuburb but some of the addresses I have scraped have "non-standard" elements eg. Warehouse 3, 123 StreetName., ExampleSuburb.

Is there a way I can strip away "non-standard" elements to make it easier for Nominatim to find them? Or is there a way I can get Nominatim to try and geolocate addresses despite non-standard elements?

For example, the code below throws a type error while executing the code and I can't figure out how to fix the reformat address to stop this from happening as it is scraped directly off the website without me intervening at all.

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def scrapecafes(city, area):

    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    type(soup_cafe_names)

    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    #cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]

    #print(cafeNamesClean)

    #addresses
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    type(soup_cafe_addresses)

    cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
    cafeAddressesClean = [address.text for address in cafeAddresses]
    #cafeAddressesTuple = [(address,) for address in cafeAddressesClean]

    #print(cafeAddressesClean)


    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    try:
        for item in cafeAddressesClean:
            location = (locator.geocode(item))
            lat = [location.latitude for item in location]
            long = [location.longitude for item in location]
            print(location)

    except:
        pass

    #zip up for table
    fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
    print(fortable)

scrapecafes(melbourne, fitzroy)

Solution

  • There 2 problems in your script.

    1. You are looping through the cafeAddressesClean but you are not storing the output anywhere.
    2. After you zip the lists, you are not converting them to list.

    The below inserts the values into sqlite database. Total there are 10 values that get inserted.

    from bs4 import BeautifulSoup
    import requests
    from requests import get
    import sqlite3
    import geopandas
    import geopy
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter
    
    #cafeNamesthornbury
    def scrapecafes(city, area):
    
        #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
        url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
        response = requests.get(url, timeout=5)
    
        soup_cafe_names = BeautifulSoup(response.content, "html.parser")
        type(soup_cafe_names)
    
        cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
        cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
        cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]
    
        print(cafeNamesClean)
    
        #addresses
        soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
        type(soup_cafe_addresses)
    
        cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
        cafeAddressesClean = [address.text for address in cafeAddresses]
        cafeAddressesTuple = [(address,) for address in cafeAddressesClean]
    
        print(cafeAddressesClean)
    
    
        ##geocode addresses
        locator = Nominatim(user_agent="myGeocoder")
        geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
    
        location = []
    
        for item in cafeAddressesClean:
            location.append(locator.geocode(item))
    
        lat = [loc.latitude for loc in location]
        long = [loc.longitude for loc in location]
    
        #zip up for table
        fortable = list(zip(cafeNamesClean, cafeAddressesClean, lat, long))
    
    ##connect to database
        try:
            sqliteConnection = sqlite3.connect('25july_database.db')
            cursor = sqliteConnection.cursor()
            print("Database created and Successfully Connected to 25july_database")
    
            sqlite_select_Query = "select sqlite_version();"
            cursor.execute(sqlite_select_Query)
            record = cursor.fetchall()
            print("SQLite Database Version is: ", record)
            cursor.close()
    
        except sqlite3.Error as error:
            print("Error while connecting to sqlite", error)
    
        #create table
        try:
            sqlite_create_table_query = ''' CREATE TABLE IF NOT EXISTS scraper (
                                            name TEXT NOT NULL,
                                            address TEXT NOT NULL,
                                            latitude FLOAT NOT NULL,
                                            longitude FLOAT NOT NULL
                                            );'''
    
            cursor = sqliteConnection.cursor()
            print("Successfully Connected to SQLite")
            cursor.execute(sqlite_create_table_query)
            sqliteConnection.commit()
            print("SQLite table created")
        except sqlite3.Error as error:
            print("Error while creating a sqlite table", error)
    
    ##enter data into table
        try:
    
            for row in list(fortable):
                sqlite_insert_name_param = """INSERT INTO scraper VALUES (?,?,?,?);"""
    
                cursor.execute(sqlite_insert_name_param, row)
    
                sqliteConnection.commit()
            
                print("Total", cursor.rowcount, "Records inserted successfully into table")
    
            cursor.close()
    
        except sqlite3.Error as error:
            print("Failed to insert data into sqlite table", error)
    
        finally:
            if (sqliteConnection):
                sqliteConnection.close()
                print("The SQLite connection is closed")
    
    scrapecafes('melbourne', 'thornbury')
    

    After running the script:

    Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
    Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
    Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
    Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
    Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
    Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
    Jerry Joy| 128  Mansfield Street, Thornbury|-37.7573008|145.0096578
    The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
    Little Henri| 848  High Street, Thornbury|51.6087678|-2.5260139
    Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556