Search code examples
python, asp.net, web-scraping, beautifulsoup, urllib

Trying to scrape .aspx site in python but it won't POST


I'm attempting to scrape https://www.idfpr.com/applications/professionprofile/default.aspx using Python 3's urllib and then parse the result with BeautifulSoup. However, even though I set each of the input fields, the POST request merely returns the same page. When the fields are populated and the search button is clicked, it should redirect to https://www.idfpr.com/Applications/ProfessionProfile/ProfileSearchResults.aspx, but it doesn't.

import urllib.request, urllib.parse, urllib.error
import socket, ssl
from bs4 import BeautifulSoup


# Script outline: GET the search form, copy its hidden ASP.NET fields, then
# POST them back together with a doctor's name and parse the response.

# SSL context that skips certificate verification; used for both requests below.
ssl_context = ssl._create_unverified_context()

    page_html = ''
    # First request: a plain GET of the search page to obtain the form's hidden fields.
    # NOTE(review): a Content-Type header is sent although this request has no body.
    get_req = urllib.request.Request('https://www.idfpr.com/applications/professionprofile/default.aspx',
                                     headers={
                                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko)  Chrome/24.0.1312.57 Safari/537.17',
                                        'Content-Type': 'application/x-www-form-urlencoded',
                                        # 'Accept-Encoding': 'gzip,deflate,sdch',
                                        # 'Accept-Language': 'en-US,en;q=0.8',
                                        # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
                                    })
    # Any failure of the initial GET prints a message and exits with status 1.
    try:
        page_html = urllib.request.urlopen(get_req,
                                           context=ssl_context
                                           ).read()
    except socket.timeout:
        print("Request timed out. Moving on.")
        exit(1)
    except urllib.error.URLError as e:
        print(e)
        exit(1)
    except ssl.CertificateError as e:
        print(e)
        exit(1)

    # Parse the downloaded form page (raw bytes) with the html5lib parser.
    soup_dummy = BeautifulSoup(
        page_html,
        'html5lib'
    )

    # Read the five hidden form fields from the page so they can be echoed back
    # in the POST. Each lookup takes the first match and raises if the element
    # or its 'value' attribute is missing.
    lastfocus = soup_dummy.select("#__LASTFOCUS")[0]['value']
    viewstate = soup_dummy.select("#__VIEWSTATE")[0]['value']
    viewstategen = soup_dummy.select("#__VIEWSTATEGENERATOR")[0]['value']
    eventvalidation = soup_dummy.findAll("input", {"type": "hidden", "name": "__EVENTVALIDATION"})[0]['value']
    eventargument = soup_dummy.select('#__EVENTARGUMENT')[0]['value']

    # build input list of doctors
    # Each entry is a tuple; index 1 is sent as LastName and index 2 as FirstName below.
    doctors = [(0,"AKRAMI","CYRUS")]

    for doctor in doctors:  # iterate over doctors and search for them on the IL site

        # Form fields for the POST: the hidden fields captured above plus the
        # visible search inputs.
        # NOTE(review): both the Search and the Clear button fields are sent, and
        # __EVENTTARGET names the Search control. A browser normally submits only
        # the button that was clicked -- confirm whether sending Clear as well
        # (or a non-empty __EVENTTARGET) is why the server re-renders the form.
        formData = (
            ('__LASTFOCUS', lastfocus),
            ('__VIEWSTATE', viewstate),
            ('__VIEWSTATEGENERATOR', viewstategen),
            ('__EVENTTARGET', 'ctl00$ctl00$MainContent$MainContentContainer$Search'),
            ('__EVENTARGUMENT', eventargument),
            ('__EVENTVALIDATION', eventvalidation),
            ('ctl00$ctl00$MainContent$MainContentContainer$LastName', doctor[1]),
            ('ctl00$ctl00$MainContent$MainContentContainer$FirstName', doctor[2]),
            ('ctl00$ctl00$MainContent$MainContentContainer$ddlCounty', '0'),
            ('ctl00$ctl00$MainContent$MainContentContainer$City', ''),
            ('ctl00$ctl00$MainContent$MainContentContainer$ddlSpecialty', '0'),
            ('ctl00$ctl00$MainContent$MainContentContainer$SpecialtyKeyword', ''),
            ('ctl00$ctl00$MainContent$MainContentContainer$ddlHospitals', '0'),
            ('ctl00$ctl00$MainContent$MainContentContainer$Search', 'Search'),
            ('ctl00$ctl00$MainContent$MainContentContainer$Clear', 'Clear')
        )

        # URL-encode the pairs and convert to bytes for use as the request body.
        encodedFields = urllib.parse.urlencode(formData).encode('ascii')
        # second HTTP request with form data
        # NOTE(review): no cookies from the first response are carried over to
        # this request; each urlopen call here is independent.
        post_req = urllib.request.Request('https://www.idfpr.com/applications/professionprofile/default.aspx',
                                          data=encodedFields,
                                          headers={
                                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko)  Chrome/24.0.1312.57 Safari/537.17',
                                            'Content-Type': 'application/x-www-form-urlencoded',
                                            # 'Accept-Encoding': 'gzip,deflate,sdch',
                                            # 'Accept-Language': 'en-US,en;q=0.8',
                                            # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
                                        })
        # NOTE(review): the same body is supplied twice -- once on the Request
        # above and again via urlopen's data argument.
        # Unlike the GET above, this call is not wrapped in a try/except.
        page_html = urllib.request.urlopen(post_req,
                                           data=encodedFields,
                                           context=ssl_context
                                           ).read()
        # Parse the POST response; `soup` is not used further in this snippet.
        soup = BeautifulSoup(page_html, "html5lib")

What am I missing? My guess is it has something to do with __EVENTTARGET; I read that you need to set it to the submit button you want to hit, which in this case is ctl00$ctl00$MainContent$MainContentContainer$Search, but that doesn't work.


Solution

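  • The following works for me, though I am using requests.Session() rather than urllib. Note that, unlike the code in the question, it leaves __EVENTTARGET empty and does not send the Clear button's field.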

    import requests
    from bs4 import BeautifulSoup as bs
    import urllib3; urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
    # Fetch the search form, echo its hidden ASP.NET fields back in a POST
    # together with the search criteria, and print the results grid's text.
    # A single Session is used so cookies from the GET accompany the POST.
    with requests.Session() as s:
        # Set on the session so the GET and the POST share the same
        # certificate policy (verification disabled for both).
        s.verify = False
        url = 'https://www.idfpr.com/applications/professionprofile/default.aspx'

        r = s.get(url)
        r.raise_for_status()  # fail loudly on an HTTP error status
        soup = bs(r.content, 'lxml')

        # Hidden fields rendered into the form; they are sent back unchanged.
        vs = soup.select_one('#__VIEWSTATE')['value']
        ev = soup.select_one('#__EVENTVALIDATION')['value']
        vsg = soup.select_one('#__VIEWSTATEGENERATOR')['value']

        # __EVENTTARGET is left empty and only the Search button's own
        # name/value pair is included; no Clear field is sent.
        data = {
            '__LASTFOCUS': '',
            '__VIEWSTATE': vs,
            '__VIEWSTATEGENERATOR': vsg,
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__EVENTVALIDATION': ev,
            'ctl00$ctl00$MainContent$MainContentContainer$LastName': 'Alaraj',
            'ctl00$ctl00$MainContent$MainContentContainer$FirstName': 'Ali ',
            'ctl00$ctl00$MainContent$MainContentContainer$ddlCounty': '0',
            'ctl00$ctl00$MainContent$MainContentContainer$City': '',
            'ctl00$ctl00$MainContent$MainContentContainer$ddlSpecialty': '0',
            'ctl00$ctl00$MainContent$MainContentContainer$SpecialtyKeyword': '',
            'ctl00$ctl00$MainContent$MainContentContainer$ddlHospitals': '0',
            'ctl00$ctl00$MainContent$MainContentContainer$Search': 'Search',
        }

        r = s.post(url, data=data)
        r.raise_for_status()
        soup = bs(r.content, 'lxml')

        # select_one returns None when no element matches, so guard before
        # reading .text instead of raising AttributeError.
        results = soup.select_one('#MainContent_MainContentContainer_gvwProfiles')
        if results is None:
            print('No results table found in the response.')
        else:
            print(results.text)