python python-requests tor data-extraction

Anonymous request being denied?

I am trying to learn how to make anonymous HTTP requests and have had some success, but my most recent request is being rejected (requesocks.exceptions.HTTPError: 400 Client Error). I am using Tor to obtain an anonymous IP. Here's my code:

from fake_useragent import UserAgent
import requests
import requesocks


def newUserAgent():
    "sets HEADERS['User-Agent'] to the current value of UA.random"
    fresh_agent = UA.random
    HEADERS.update({'User-Agent': fresh_agent})

def newUrl():
    """returns the search url whose CurrentPage value is CURRENT_PAGE + 1

    CURRENT_PAGE is only read here; it is not modified.
    """
    next_page = str(CURRENT_PAGE + 1)
    # NOTE: every search parameter sits after the '#', i.e. in the url fragment
    pieces = [
        'http://www.realtor.ca/Residential/Map.aspx#CultureId=1&ApplicationId=1&RecordsPerPage=9&MaximumResults=9&PropertyTypeId=300&TransactionTypeId=2',
        '&StoreyRange=0-0&OwnershipTypeGroupId=1&BuildingTypeId=1&BedRange=0-0&BathRange=0-0&LongitudeMin=-119.66980648040801&LongitudeMax=-119.58174419403106',
        '&LatitudeMin=49.822197219797346&LatitudeMax=49.84943388971021&SortOrder=A&SortBy=1&viewState=l&Longitude=-119.487716674805&Latitude=49.8434562683105',
        '&CurrentPage=',
        next_page,
    ]
    return ''.join(pieces)


def getDataDict():
    "returns data_dict from msl.ca url"
    # Reset User-Agent in HEADERS, increment CurrentPage in url
    newUserAgent()
    url = newUrl()

    # Check visible IP
    ip = SESSION.get("http://icanhazip.com/")
    print "visible IP is:", ip.text

    # Request the URL 
    response = SESSION.get(url, headers=HEADERS)
    response.raise_for_status() # raise exception if invalid response


def main():
    "script entry point: calls getDataDict() once"
    getDataDict()



#------------------------
#    global objects:
#------------------------

# Page counter read by newUrl(), which requests page CURRENT_PAGE + 1
CURRENT_PAGE = 0

# fake_useragent object; newUserAgent() reads UA.random from it
UA = UserAgent()
# Headers passed with the realtor.ca request in getDataDict();
# newUserAgent() adds the 'User-Agent' entry before each request.
HEADERS = {
'Host': 'www.realtor.ca',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
# NOTE(review): 'Content-Type' and 'Content-Length' describe a request body,
# but getDataDict() only issues GET requests without one - confirm that
# these two entries belong here.
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
'Content-Length': '411',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
}


# Initialize a new wrapped requests object (shared by every request in getDataDict)
SESSION = requesocks.session()
# Use Tor for both HTTP and HTTPS: route them through the SOCKS5 proxy on localhost:9050
SESSION.proxies = {'http': 'socks5://localhost:9050', 'https': 'socks5://localhost:9050'}





# Run main() only when this file is executed as a script
if __name__ == '__main__':
    main()

What am I doing wrong here? The IP prints fine (i.e. the request to http://icanhazip.com/ works), but the subsequent request to the realtor.ca URL fails. What's different?

Solution

The server is returning HTTP Error 400 because the request is invalid. You cannot make an HTTP request with a URL fragment (the anchor text, #CultureId...) in it.

The hash mark (#) and everything after it are not sent in the HTTP request. Sites that make heavy use of Ajax (like the one you are trying to use) will use Javascript to read the anchor text and then issue Ajax requests to update the content.

From a quick look at their site, the search described by the anchor text is sent via an Ajax POST to the URL http://www.realtor.ca/api/Listing.svc/PropertySearch_Post, with the anchor text as the POST body.

Screenshot:

From the looks of the cookies, you will need to first make a request to /Residential/Map.aspx to establish session cookies; then you can try making a request to the PropertySearch_Post URL with your search parameters. It returns a JSON response, which you'll have to parse to do anything with the search results.

EDIT: This code works for me (prints a successful JSON response with results)

import requests
import requesocks


def newUserAgent():
    "stores a fixed User-Agent string under the 'User-Agent' key of HEADERS"
    HEADERS.update({'User-Agent': 'Mozilla/5.0 (Ubuntu; Firefox=41)'})

def newUrl():
    """returns the url of the realtor.ca map page

    getDataDict() requests this page before sending the search POST. The
    path ends in '.aspx', the same page named in the 'Referer' entry of
    HEADERS. No page counter is involved; the url is constant.
    """
    return 'http://www.realtor.ca/Residential/Map.aspx'


def getDataDict():
    "returns data_dict from msl.ca url"
    # Reset User-Agent in HEADERS, increment CurrentPage in url
    newUserAgent()
    url = newUrl()

    # Check visible IP
    ip = SESSION.get("http://icanhazip.com/")
    print "visible IP is:", ip.text

    # Request the URL 
    response = SESSION.get(url, headers=HEADERS)
    response.raise_for_status() # raise exception if invalid response

    PAYLOAD = { 'CultureId': '1', 'ApplicationId': '1', 'RecordsPerPage': '9', 'MaximumResults': '9', 'PropertyTypeId': '300','TransactionTypeId': '2','StoreyRange': '0-0', 'OwnershipTypeGroupId': '1', 'BuildingTypeId': '1', 'BedRange': '0-0', 'BathRange': '0-0', 'LongitudeMin': '-119.66980648040801', 'LongitudeMax': '-119.58174419403106', 'LatitudeMin': '49.822197219797346', 'LatitudeMax': '49.84943388971021', 'SortOrder': 'A', 'SortBy': '1',  'viewState': 'l', 'Longitude': '-119.487716674805', 'Latitude': '49.8434562683105', 'CurrentPage': '1' }
    response = SESSION.post('http://www.realtor.ca/api/Listing.svc/PropertySearch_Post', data=PAYLOAD, headers=HEADERS)

    print response.text

def main():
    "script entry point: calls getDataDict() once"
    getDataDict()



#------------------------
#    global objects:
#------------------------

# Not referenced by the functions in this version of the script; the search
# payload in getDataDict() hard-codes 'CurrentPage'.
CURRENT_PAGE = 0

# Headers passed with both realtor.ca requests in getDataDict();
# newUserAgent() adds the 'User-Agent' entry.
HEADERS = {
'Host': 'www.realtor.ca',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Referer': 'http://www.realtor.ca/Residential/Map.aspx',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
}
# newUserAgent() has no return statement, so UA is bound to None; the call
# matters only for its side effect of filling HEADERS['User-Agent'].
UA = newUserAgent()

# Initialize a new wrapped requests object (shared by every request in getDataDict)
SESSION = requesocks.session()
# Use Tor for both HTTP and HTTPS: route them through the SOCKS5 proxy on localhost:9050
SESSION.proxies = {'http': 'socks5://localhost:9050', 'https': 'socks5://localhost:9050'}





# Run main() only when this file is executed as a script
if __name__ == '__main__':
    main()