python, scrapy

Scrape data with scrapy


I am trying to scrape Pararius.nl for practice with Scrapy, but when I start crawling it returns the Fairlane protection page instead of the listings. How can I get past it? Do I need any extra tools? Please help with an example.

def parse(self, response):
    url = 'https://www.pararius.nl/{deal_type}/nederland/p-{page}/'

    for deal_type in ['huurwoningen', 'koopwoningen']:
        for i in range(1, 2):
            yield scrapy.Request(
                url.format(deal_type=deal_type, page=i),
                callback=self.parse_pages,
                cookies=self.cookies,
                headers=self.h,
                method='GET',
                cb_kwargs={'deal_type': deal_type},
            )

def parse_pages(self, response, deal_type):
    print(response.url)
    return

Solution

  • I was able to paginate and collect data from the site without any issues. In your URL you are using p-, but as far as I can see on the website it should be page-.

    Here is the code I used for it:

    def parse(self, response):
        url = 'https://www.pararius.nl/{deal_type}/nederland/page-{page}/'
        # 'koopwoningen'
        for deal_type in ['huurwoningen']:
            for i in range(1, 2):
                yield scrapy.Request(
                    url.format(deal_type=deal_type, page=i),
                    callback=self.parse_pages,
                    method='GET',
                )
    
    def parse_pages(self, response):
        for row in response.css('li.search-list__item'):
            href = row.css('a.listing-search-item__link--depiction::attr(href)').get()
            if href:
                yield {
                    'search_url': response.url,
                    'url': 'https://pararius.nl' + href,
                    'title': row.xpath(
                        './/div[contains(@class, "listing-search-item__sub-title")]/text()').get().strip()
                }
    
        page = response.meta.get('page', 2)
        if 'page-' in response.url:
            next_url = re.sub(r'page-\d+', f'page-{page}', response.url)
        else:
            next_url = response.url.strip('/') + f'/page-{page}'
        page += 1
        if page < 10:
            yield scrapy.Request(
                next_url,
                callback=self.parse_pages,
                method='GET',
                meta={'page': page}
            )
    

    Additional settings I used:

    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    ROBOTSTXT_OBEY = False
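
    If you prefer to keep these settings on the spider itself rather than in the project settings, Scrapy also accepts them through the custom_settings class attribute. A minimal sketch (the spider class and name here are illustrative, not from the snippets above):

    import scrapy


    class ParariusSpider(scrapy.Spider):
        name = 'pararius'  # illustrative spider name

        # Same values as above, applied only to this spider.
        custom_settings = {
            'USER_AGENT': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
            ),
            'ROBOTSTXT_OBEY': False,
        }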
    

    And this is an example of the results I received:

    {"seacrh_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/studio-te-huur/breda/60de7cf7/vredenburchstraat", "title": "4811 RD Breda (Boeimeer)"},
    {"seacrh_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/appartement-te-huur/eindhoven/7035fc54/aalsterweg", "title": "5615 CH Eindhoven (Looiakkers)"},
    {"seacrh_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/appartement-te-huur/rotterdam/a5eba2a3/herman-gorterstraat", "title": "3061 SM Rotterdam (Kralingen West)"},
    

    For zah.nl an additional library is required: pip install undetected-chromedriver

    You can use the following:

    import re
    import time
    
    import scrapy
    
    import undetected_chromedriver as uc
    
    
    class ZahSpider(scrapy.Spider):
        name = 'zah'
        allowed_domains = ['www.zah.nl']
        start_urls = ['https://www.zah.nl/te-koop/?page=1']
    
        headers = {
            'authority': 'www.zah.nl',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'uk-UA,uk;q=0.9,en-US;q=0.8,en;q=0.7',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
    
        def start_requests(self):
            options = uc.ChromeOptions()
            driver = uc.Chrome(options=options)
            driver.maximize_window()
            driver.get('https://www.zah.nl/te-koop/?page=1')
            time.sleep(5)
    
            cookies_list = driver.get_cookies()
            self.cookies_dict = {}
            for cookie in cookies_list:
                self.cookies_dict[cookie['name']] = cookie['value']
            driver.quit()
    
            yield scrapy.Request(
                url='https://www.zah.nl/te-koop/?page=1',
                cookies=self.cookies_dict,
                headers=self.headers,
                callback=self.parse
            )
    
        def parse(self, response):
            for row in response.css('div.result'):
                yield {
                    'title': row.css('a > h2::text').get(),
                    'url': row.css('a::attr(href)').get()
                }
    
            page = response.meta.get('page', 2)
            if page < 5:
                next_url = re.sub(r'page=\d+', f'page={page}', response.url)
                page += 1
                yield scrapy.Request(
                    url=next_url,
                    cookies=self.cookies_dict,
                    headers=self.headers,
                    meta={'page': page},
                    callback=self.parse
                )
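
    To actually collect the items, run the spider and export what it yields. A minimal sketch of driving it from a script with Scrapy's CrawlerProcess and the FEEDS setting (the output file name is illustrative; the command-line equivalent would be scrapy crawl zah -O zah_items.json inside a Scrapy project):

    from scrapy.crawler import CrawlerProcess

    # ZahSpider is the class defined above; import it if it lives in another module.
    process = CrawlerProcess(settings={
        'ROBOTSTXT_OBEY': False,
        # Write every yielded item to a JSON file (file name is just an example).
        'FEEDS': {'zah_items.json': {'format': 'json', 'overwrite': True}},
    })
    process.crawl(ZahSpider)
    process.start()  # blocks until the crawl finishes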