I'm attempting to scrape https://www.idfpr.com/applications/professionprofile/default.aspx using Python 3's `urllib` and then parse the result with BeautifulSoup. However, even though I set every input field, the POST request merely returns the same page. When the fields are populated and the Search button is clicked, the site should redirect to https://www.idfpr.com/Applications/ProfessionProfile/ProfileSearchResults.aspx, but it doesn't.
import urllib.request, urllib.parse, urllib.error
import socket, ssl
from bs4 import BeautifulSoup
# Fetch the IDFPR profile-search form, read the ASP.NET hidden fields out of
# it, then POST one search per doctor back to the same URL.
SEARCH_URL = 'https://www.idfpr.com/applications/professionprofile/default.aspx'

# The GET and the POST send the same header set, so it is defined once.
REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
    'Content-Type': 'application/x-www-form-urlencoded',
    # 'Accept-Encoding': 'gzip,deflate,sdch',
    # 'Accept-Language': 'en-US,en;q=0.8',
    # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
}

# Certificate verification is switched off for every request made below.
ssl_context = ssl._create_unverified_context()

# --- first HTTP request: load the empty search form ---
page_html = ''
get_req = urllib.request.Request(SEARCH_URL, headers=REQUEST_HEADERS)
try:
    response = urllib.request.urlopen(get_req, context=ssl_context)
    page_html = response.read()
except socket.timeout:
    print("Request timed out. Moving on.")
    exit(1)
except (urllib.error.URLError, ssl.CertificateError) as e:
    # Both failure kinds are reported the same way: print and stop.
    print(e)
    exit(1)

soup_dummy = BeautifulSoup(page_html, 'html5lib')


def _hidden_value(selector):
    """Return the ``value`` attribute of the first element matching *selector*."""
    return soup_dummy.select(selector)[0]['value']


# Hidden fields that have to be echoed back with the POST.
lastfocus = _hidden_value("#__LASTFOCUS")
viewstate = _hidden_value("#__VIEWSTATE")
viewstategen = _hidden_value("#__VIEWSTATEGENERATOR")
eventvalidation = soup_dummy.findAll(
    "input", {"type": "hidden", "name": "__EVENTVALIDATION"}
)[0]['value']
eventargument = _hidden_value('#__EVENTARGUMENT')

# Input list of doctors as (id, last name, first name) tuples.
doctors = [(0, "AKRAMI", "CYRUS")]

# Search for each doctor on the IL site.
for doctor in doctors:
    last_name = doctor[1]
    first_name = doctor[2]
    # NOTE(review): both the Search and the Clear submit buttons are included
    # in the posted fields, and __EVENTTARGET also names the Search button.
    # A browser sends only the button that was clicked, so this combination
    # may be why the server re-renders the blank form - confirm against the
    # site.
    formData = (
        ('__LASTFOCUS', lastfocus),
        ('__VIEWSTATE', viewstate),
        ('__VIEWSTATEGENERATOR', viewstategen),
        ('__EVENTTARGET', 'ctl00$ctl00$MainContent$MainContentContainer$Search'),
        ('__EVENTARGUMENT', eventargument),
        ('__EVENTVALIDATION', eventvalidation),
        ('ctl00$ctl00$MainContent$MainContentContainer$LastName', last_name),
        ('ctl00$ctl00$MainContent$MainContentContainer$FirstName', first_name),
        ('ctl00$ctl00$MainContent$MainContentContainer$ddlCounty', '0'),
        ('ctl00$ctl00$MainContent$MainContentContainer$City', ''),
        ('ctl00$ctl00$MainContent$MainContentContainer$ddlSpecialty', '0'),
        ('ctl00$ctl00$MainContent$MainContentContainer$SpecialtyKeyword', ''),
        ('ctl00$ctl00$MainContent$MainContentContainer$ddlHospitals', '0'),
        ('ctl00$ctl00$MainContent$MainContentContainer$Search', 'Search'),
        ('ctl00$ctl00$MainContent$MainContentContainer$Clear', 'Clear'),
    )
    query_string = urllib.parse.urlencode(formData)
    encodedFields = query_string.encode('ascii')

    # --- second HTTP request: submit the form data ---
    post_req = urllib.request.Request(
        SEARCH_URL,
        data=encodedFields,
        headers=REQUEST_HEADERS,
    )
    # The body is supplied on the Request and again to urlopen; both carry
    # the same bytes.
    response = urllib.request.urlopen(
        post_req,
        data=encodedFields,
        context=ssl_context,
    )
    page_html = response.read()
    soup = BeautifulSoup(page_html, "html5lib")
What am I missing? My guess is that it has something to do with `__EVENTTARGET`; I read that you need to set it to the submit button you want to hit, which in this case is `ctl00$ctl00$MainContent$MainContentContainer$Search`, but that doesn't work.
The following works for me. Note that I am using `requests.Session()`, though.
import requests
from bs4 import BeautifulSoup as bs
import urllib3; urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Run one profile search through a requests.Session, so that any cookies set
# by the initial GET accompany the POST that submits the form.
url = 'https://www.idfpr.com/applications/professionprofile/default.aspx'

with requests.Session() as s:
    # First request: load the form to obtain the ASP.NET hidden fields.
    r = s.get(url, verify=False)
    soup = bs(r.content, 'lxml')
    vs = soup.select_one('#__VIEWSTATE')['value']
    ev = soup.select_one('#__EVENTVALIDATION')['value']
    vsg = soup.select_one('#__VIEWSTATEGENERATOR')['value']

    # Only the Search button is posted and __EVENTTARGET is left empty.
    data = {
        '__LASTFOCUS': '',
        '__VIEWSTATE': vs,
        '__VIEWSTATEGENERATOR': vsg,
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__EVENTVALIDATION': ev,
        'ctl00$ctl00$MainContent$MainContentContainer$LastName': 'Alaraj',
        'ctl00$ctl00$MainContent$MainContentContainer$FirstName': 'Ali ',
        'ctl00$ctl00$MainContent$MainContentContainer$ddlCounty': '0',
        'ctl00$ctl00$MainContent$MainContentContainer$City': '',
        'ctl00$ctl00$MainContent$MainContentContainer$ddlSpecialty': '0',
        'ctl00$ctl00$MainContent$MainContentContainer$SpecialtyKeyword': '',
        'ctl00$ctl00$MainContent$MainContentContainer$ddlHospitals': '0',
        'ctl00$ctl00$MainContent$MainContentContainer$Search': 'Search',
    }

    # Second request: submit the search. verify=False is passed here as well
    # so the POST treats the certificate the same way the GET did, instead of
    # depending on how the library handles a reused connection.
    r = s.post(url, data=data, verify=False)
    soup = bs(r.content, 'lxml')

    # select_one returns None when nothing matches, so check before reading
    # .text rather than failing with an AttributeError.
    results = soup.select_one('#MainContent_MainContentContainer_gvwProfiles')
    if results is None:
        print('No results table found in the response.')
    else:
        print(results.text)