I am trying to learn to make anonymous HTTP requests and have had some success, but my most recent attempt is being rejected (requesocks.exceptions.HTTPError: 400 Client Error). I am using Tor to obtain an anonymous IP. Here's my code:
from fake_useragent import UserAgent
import requests
import requesocks
def newUserAgent():
    """Store whatever ``UA.random`` yields as the 'User-Agent' entry of HEADERS."""
    HEADERS.update({'User-Agent': UA.random})
def newUrl():
    """Build the realtor.ca map URL whose CurrentPage is CURRENT_PAGE + 1.

    The module-level CURRENT_PAGE is only read here, never modified. Note
    that every search parameter sits after the ``#`` in the returned string.
    """
    next_page = CURRENT_PAGE + 1
    base = (
        'http://www.realtor.ca/Residential/Map.aspx#CultureId=1&ApplicationId=1&RecordsPerPage=9&MaximumResults=9&PropertyTypeId=300&TransactionTypeId=2'
        '&StoreyRange=0-0&OwnershipTypeGroupId=1&BuildingTypeId=1&BedRange=0-0&BathRange=0-0&LongitudeMin=-119.66980648040801&LongitudeMax=-119.58174419403106'
        '&LatitudeMin=49.822197219797346&LatitudeMax=49.84943388971021&SortOrder=A&SortBy=1&viewState=l&Longitude=-119.487716674805&Latitude=49.8434562683105'
        '&CurrentPage='
    )
    return base + str(next_page)
def getDataDict():
"returns data_dict from msl.ca url"
# Reset User-Agent in HEADERS, increment CurrentPage in url
newUserAgent()
url = newUrl()
# Check visible IP
ip = SESSION.get("http://icanhazip.com/")
print "visible IP is:", ip.text
# Request the URL
response = SESSION.get(url, headers=HEADERS)
response.raise_for_status() # raise exception if invalid response
def main():
    """Script entry point: run one getDataDict() request cycle."""
    getDataDict()
#------------------------
# global objects:
#------------------------

# Page counter read by newUrl(), which asks for CURRENT_PAGE + 1.
CURRENT_PAGE = 0

# Source of the User-Agent strings used by newUserAgent().
UA = UserAgent()

# Headers sent with the listing request; newUserAgent() adds 'User-Agent'.
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Content-Length': '411',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Initialize a new wrapped requests object
SESSION = requesocks.session()

# Use Tor for both HTTP and HTTPS
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()
What am I doing wrong here? The IP prints fine (i.e. the request to http://icanhazip.com/ works), but the subsequent request to the listing URL fails - what's different?
The server is returning HTTP Error 400. The request URL is invalid.
You cannot make an HTTP request with anchor text (#CultureId...) in it.
The hash mark (#) and everything after it are not sent in the HTTP request. Sites that make heavy use of Ajax (like the one you are trying to use) will use Javascript to read the anchor text and then issue Ajax requests to update the content.
From a quick look at their site, the search described by the anchor text is sent via an Ajax POST to the URL http://www.realtor.ca/api/Listing.svc/PropertySearch_Post with the anchor text in the POST body.
From the looks of the cookies, you will first need to make a request to /Residential/Map.aspx to establish session cookies; then you can try making a request to the PropertySearch_Post URL with your search parameters. It returns a JSON response, which you'll have to parse to do anything with the search results.
EDIT: This code works for me (prints a successful JSON response with results)
import requests
import requesocks
def newUserAgent():
    """Set the 'User-Agent' entry of the shared HEADERS dict to a fixed string."""
    HEADERS.update({'User-Agent': 'Mozilla/5.0 (Ubuntu; Firefox=41)'})
def newUrl():
    """Return the URL of the realtor.ca map page.

    The value is constant: no page counter is read or changed here. The
    path ends in ``.aspx`` so that it matches the page named in the
    Referer header.
    """
    return 'http://www.realtor.ca/Residential/Map.aspx'
def getDataDict():
"returns data_dict from msl.ca url"
# Reset User-Agent in HEADERS, increment CurrentPage in url
newUserAgent()
url = newUrl()
# Check visible IP
ip = SESSION.get("http://icanhazip.com/")
print "visible IP is:", ip.text
# Request the URL
response = SESSION.get(url, headers=HEADERS)
response.raise_for_status() # raise exception if invalid response
PAYLOAD = { 'CultureId': '1', 'ApplicationId': '1', 'RecordsPerPage': '9', 'MaximumResults': '9', 'PropertyTypeId': '300','TransactionTypeId': '2','StoreyRange': '0-0', 'OwnershipTypeGroupId': '1', 'BuildingTypeId': '1', 'BedRange': '0-0', 'BathRange': '0-0', 'LongitudeMin': '-119.66980648040801', 'LongitudeMax': '-119.58174419403106', 'LatitudeMin': '49.822197219797346', 'LatitudeMax': '49.84943388971021', 'SortOrder': 'A', 'SortBy': '1', 'viewState': 'l', 'Longitude': '-119.487716674805', 'Latitude': '49.8434562683105', 'CurrentPage': '1' }
response = SESSION.post('http://www.realtor.ca/api/Listing.svc/PropertySearch_Post', data=PAYLOAD, headers=HEADERS)
print response.text
def main():
    """Script entry point: run one getDataDict() request cycle."""
    getDataDict()
#------------------------
# global objects:
#------------------------

CURRENT_PAGE = 0

# Headers shared by the map-page GET and the search POST.
HEADERS = {
    'Host': 'www.realtor.ca',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# NOTE(review): newUserAgent() has no return statement, so UA is bound to
# None; the call's effect is to seed HEADERS['User-Agent'].
UA = newUserAgent()

# Initialize a new wrapped requests object
SESSION = requesocks.session()

# Use Tor for both HTTP and HTTPS
SESSION.proxies = dict.fromkeys(('http', 'https'), 'socks5://localhost:9050')

if __name__ == '__main__':
    main()