Tried to crawl restaurants address from google front page information panel but getting "urllib.error.HTTPError: HTTP Error 403: Forbidden" error and program are not run. I am fresher in python web scraping, please help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
#get google URL.
url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
page = fromstring(response)
soup = BeautifulSoup(page, 'url.parser')
the_page = soup.prettify("utf-8")
hotel_json = {}
for line in soup.find_all('script',attrs={"type" :
"application/ld+json"}):
details = line.text.strip()
details = json.loads(details)
hotel_json["address"]["LrzXr"]=details["address"]["streetAddress"]
break
with open(hotel_json["name"]+".html", "wb") as file:
file.write(html)
with open(hotel_json["name"]+".json", 'w') as outfile:
json.dump(hotel_json, outfile, indent=4)
Add a user-agent header
request = urllib.request.Request(url, headers = {'User-Agent' : 'Mozilla/5.0'})