web-scraping, scrapy, scrapy-splash

Scraper only getting first item?


My scraper only picks up the first listing from the page below. How can I make it extract every address in the list? It is a simple program that pulls the addresses from the link below.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest


class Listings2Spider(CrawlSpider):
    name = 'listings2'
    allowed_domains = ['www.realtor.ca']
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(3))  -- give the JS-rendered listings time to load
        return splash:html()
    end
    '''

    def start_requests(self):
        yield SplashRequest(url='https://www.realtor.ca/map#ZoomLevel=13&Center=43.686631%2C-79.339824&LatitudeMax=43.75741&LongitudeMax=-79.25894&LatitudeMin=43.61577&LongitudeMin=-79.42071&view=list&Sort=6-D&PGeoIds=g20_dpz8de7m&GeoName=East%20York%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&Currency=CAD',
                             headers={'User-Agent': self.user_agent}, callback=self.parse_item, endpoint="execute", args={'lua_source': self.script})

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='cardCon']"), callback='parse_item', follow=True, process_request='set_user_agent'),
    )

    def set_user_agent(self, request):
        request.headers['User-Agent'] = self.user_agent
        return request

    def parse_item(self, response):
        yield {
            'Address': response.xpath("//div[@class='listingCardAddress']/text()").get()
        }

Solution

  • There seems to be a rate limit in place. You can call the API directly and skip browser automation entirely, although the site still appears to block requests after a few pages. Doing it this way you can get up to 100 results per page, which (when it works!) is much quicker, and the JSON gives you far more detail than the front-end HTML. A throttling sketch follows the code below.

    import requests
    
    s = requests.Session()
    home_url = 'https://www.realtor.ca/map#ZoomLevel=13&Center=43.686631%2C-79.339824&LatitudeMax=43.75741&LongitudeMax=-79.25894&LatitudeMin=43.61577&LongitudeMin=-79.42071&view=list&Sort=6-D&PGeoIds=g20_dpz8de7m&GeoName=East%20York%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&Currency=CAD'
    step = s.get(home_url)  # visit the home page first so the session picks up cookies
    print(step)
    
    url = 'https://api2.realtor.ca/Listing.svc/PropertySearch_Post'
    
    headers = {
        'accept':'*/*',
        'accept-encoding':'gzip, deflate, br',
        'content-type':'application/x-www-form-urlencoded; charset=UTF-8',
        'origin':'https://www.realtor.ca',
        'referer':'https://www.realtor.ca/',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        }
    
    output = []
    for page in range(1, 5):  # first four pages of results; widen the range as needed
    
        payload = {
            'ZoomLevel':'13',
            'LatitudeMax':'43.75741',
            'LongitudeMax':'-79.25894',
            'LatitudeMin':'43.61577',
            'LongitudeMin':'-79.42071',
            'Sort':'6-D',
            'PropertyTypeGroupID':'1',
            'PropertySearchTypeId':'1',
            'TransactionTypeId':'2',
            'Currency':'CAD',
            'RecordsPerPage':'100',
            'ApplicationId':'1',
            'CultureId':'1',
            'Version':'7.0',
            'CurrentPage': str(page)
            }
    
        post = s.post(url, headers=headers, data=payload).json()
        results = len(post['Results'])
        print(f'Scraping page: {page}, results: {results}')
    
        for listing in post['Results']:
            print(listing['Id'], listing['Property']['Price'])
            output.append(listing)  # keep the full JSON record for each listing
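
    Since the block seems to kick in only after a few pages, slowing down may help. The sketch below is an assumption on my part, not something the API documents: `post_with_backoff` is a hypothetical helper, and the five-second delay is a guess.

    import time

    def post_with_backoff(session, url, headers, payload, tries=3, delay=5.0):
        # Hypothetical helper (not from the original answer): retry the search
        # POST with an increasing pause, since the block seems time-based.
        for attempt in range(tries):
            response = session.post(url, headers=headers, data=payload)
            if response.ok:
                time.sleep(delay)  # pause before the caller asks for the next page
                return response.json()
            time.sleep(delay * (attempt + 1))  # back off, then try again
        response.raise_for_status()  # still failing after all tries

    In the loop above, `post = s.post(url, headers=headers, data=payload).json()` would then become `post = post_with_backoff(s, url, headers, payload)`.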
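
    As a closing note on the original Splash spider: `.get()` on a Scrapy selector returns only the first matching node, which is one reason a single address comes back per page, while `.getall()` returns every match. A minimal sketch of `parse_item` along those lines:

    def parse_item(self, response):
        # .getall() returns every matching node; .get() stops at the first
        for address in response.xpath("//div[@class='listingCardAddress']/text()").getall():
            yield {'Address': address}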