Search code examples
python · scrapy · response · python-requests

Web scrape aspx website using python


I am able to get the HTTP headers & params but I am unable to generate the response object. The site is https://www.sacmembership.ca/Search/Search.aspx and I am looking to scrape the details for each practitioner. Here is the code I have reached so far:

"""POST the search form on sacmembership.ca and print the result HTML.

ASP.NET WebForms pages reject a bare POST: every submit must echo back
the server-generated hidden state fields (__VIEWSTATE, __VIEWSTATEGENERATOR,
__EVENTVALIDATION, ...).  So we GET the page first, harvest those hidden
inputs, merge them with the visible form fields, then POST.
Ported to Python 3 (cookielib/urllib2 no longer exist).
"""
import http.cookiejar
import re
import urllib.parse
import urllib.request

url = 'https://www.sacmembership.ca/Search/Search.aspx'

# NOTE: never hard-code Content-Length (the original sent a stale 16581);
# urllib computes it from the encoded body.  Also dropped
# "Accept-Encoding: gzip" -- urllib does not auto-decompress, so asking
# for gzip yields unreadable bytes.
http_header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": url,
    "Accept-Language": "en-US,en;q=0.8",
}

# Visible search-form fields; empty string means "no filter".
params = {
    'ctl00$ContentPlaceHolder1$ddlProfession': "",
    'ctl00$ContentPlaceHolder1$ddlFacility': "",
    'ctl00$ContentPlaceHolder1$txtCity': "",
    'ctl00$ContentPlaceHolder1$ddlProvince': "",
    'ctl00$ContentPlaceHolder1$ddlSortBy': "LastName",
    'ctl00$ContentPlaceHolder1$ddlLanguageOfPractice': "",
    'ctl00$ContentPlaceHolder1$txtEmployerCompanyName': "",
    'ctl00$ContentPlaceHolder1$txtFirstName': "",
    'ctl00$ContentPlaceHolder1$txtLastName': "",
    'ctl00$ContentPlaceHolder1$btnSearch': "Search",
}

cookie_jar = http.cookiejar.LWPCookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar))

# Step 1: GET the form so the server sets its session cookie and we can
# scrape the hidden ASP.NET state fields the POST must echo back.
with opener.open(urllib.request.Request(url, headers=http_header)) as res:
    page = res.read().decode('utf-8', errors='replace')

hidden = dict(re.findall(
    r'<input[^>]+type="hidden"[^>]+name="([^"]+)"[^>]*value="([^"]*)"',
    page))
params.update(hidden)

# Step 2: POST the combined visible + hidden fields.
body = urllib.parse.urlencode(params).encode('ascii')
with opener.open(urllib.request.Request(url, body, http_header)) as res:
    html = res.read().decode('utf-8', errors='replace')
print(html)

Please help me with this.


Solution

  • I was able to achieve what I was looking for using Selenium.

    """Walk each search-results URL with Selenium, harvest apartment listing
    hrefs from every paginated page, and append them to a CSV file.

    Fixes over the original:
    - `Links` was defined but `links` iterated -> NameError.
    - `time.sleep(02)` / `(04)`: leading-zero literals are a SyntaxError
      in Python 3; plain integers now.
    - Windows paths are raw strings ("C:\\Users..." -- "\\U" is a
      SyntaxError in Python 3 non-raw strings).
    - File opened in text append mode (was binary "ab" with str writes).
    - The pagination loop re-extracted from the stale first-page selector
      (`sel` instead of `sel1`), writing page 1's listings 21 times.
    - The browser is always quit via try/finally.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from scrapy import Selector
    import time

    OUTPUT_CSV = (r"C:\Users\ssamant\Desktop\Client\Anida\Phase_II"
                  r"\Apartments\apartment_listing.csv")
    LISTING_XPATH = ('//section[@class="placardHeader"]'
                     '//a[@class="placardTitle"]//@href')
    PAGES_PER_LINK = 21  # number of "next" clicks after the first page


    def _append_listings(page_source):
        """Extract listing hrefs from page_source and append them to the CSV."""
        sel = Selector(text=page_source, type="html")
        with open(OUTPUT_CSV, "a", newline="") as export:
            for href in sel.xpath(LISTING_XPATH).extract():
                export.write('{}\n'.format(href))


    driver = webdriver.Firefox()
    links = ['', '', '', '', '']  # TODO: fill with the search-result URLs
    try:
        for link in links:
            driver.get(link)
            time.sleep(2)  # crude wait for the page to render
            driver.find_element(By.ID, "showAll").click()
            time.sleep(4)  # "show all" reloads the listing section
            _append_listings(driver.page_source)
            for _ in range(PAGES_PER_LINK):
                driver.find_element(By.CLASS_NAME, "next").click()
                time.sleep(2)
                # Re-parse the CURRENT page (original reused the stale
                # first-page selector here).
                _append_listings(driver.page_source)
    finally:
        driver.quit()