python, web-scraping, scrapy, playwright, playwright-python

I am trying to use scrapy-playwright to scroll through a web shop in order to scrape all products, but it doesn't work.


I am trying to click on the load more button until it disappears and all products are loaded. Then I want to click on all individual products to scrape the data I need from each product's own page.

I have tried multiple ways of scrolling down and have rearranged the code and syntax a few times using ChatGPT and Gemini. However, I still get an empty JSON file back.


import scrapy
import datetime
import re

from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector



class LidlSpider(scrapy.Spider):
    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }
    start_urls = [
        'https://sortiment.lidl.ch/de/sussigkeiten-snacks#/', # 246 products
    ] 
    
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods':[
                        PageMethod('wait_for_selector', 'div.product-item-info'),
                        PageMethod("wait_for_selector", "button.primary.amscroll-load-button-new"),
                        
                    ]
                }
            )
    async def scroll_to_bottom(self,page):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        page = response.meta["playwright_page"]
        pagination_buttons = page.locator("button.primary.amscroll-load-button-new")  # Adjust the selector as needed

        
        if pagination_buttons:
            buttons = await pagination_buttons.all()
            for button in buttons:
                await button.click()  # Trigger pagination action
                await page.wait_for_navigation()
                await self.scroll_to_bottom(page)  # Optional scroll down on the new page
                
        # Extract product information after pagination click
        content = await page.content()
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

      

    def parse_produkt(self, response):
        
        mini_dict = {
                'retailer':       self.name,
                'datetime':       datetime.date.today(),
                'categorie':      None,
                'id':             None, #response.css('div.col-left>p::text').get().split()[1],
                'brand':          str(response.css('p.brand-name::text').get()),
                'detail':         str(response.css('span.base::text').get()),
                'actual_price':   response.css('strong.pricefield__price::attr(content)').get(),
                'quantity':       None,
                'regular_price':  None,
                'price_per_unit': None,

            }
           

        yield mini_dict

       
    

        
if __name__ == "__main__":  # __main__ was only created for debug purposes
    process = CrawlerProcess()
    process.crawl(LidlSpider)
    process.start()


Solution

  • There are a couple of problems I can see:

    • There is a popup on the page, where you first need to click the Zustimmen (Agree) button before you can click anything else. So add the following to your code:
    popup = 'div#onetrust-banner-sdk'
    if await page.is_visible(popup, timeout = 5000):
        await page.locator('button#onetrust-accept-btn-handler').click()
        await page.wait_for_selector(popup, state='hidden')
    
    • The page.wait_for_navigation() call gives an error, as there is no such method on Playwright's Page object in the Python API, so you can replace it with page.wait_for_load_state("domcontentloaded").
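    For clarity, a minimal before/after sketch of that one-line replacement inside parse():

    # Before: fails, because Page has no wait_for_navigation() method in the Python API
    # await page.wait_for_navigation()

    # After: wait until the page reports the domcontentloaded load state
    await page.wait_for_load_state("domcontentloaded")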

    • There is a single Weitere Produkte laden (Load More Products) button, which you need to click multiple times until it goes away, so pagination_buttons in your code returns that single button, which gets clicked only once:

    pagination_buttons = page.locator("button.primary.amscroll-load-button-new")
    buttons = await pagination_buttons.all()
    for button in buttons:
        await button.click()  # Trigger pagination action
        await page.wait_for_load_state("domcontentloaded")  # Wait for new page to load
        await self.scroll_to_bottom(page)  # Optional scroll down on the new page
    

    You can fix that by replacing the above with:

    while True: 
        try:
            show_more_button = page.locator("button.primary.amscroll-load-button-new")
            if show_more_button:
                await show_more_button.click()
                await page.wait_for_load_state("domcontentloaded", timeout=5000)
                await self.scroll_to_bottom(page)  
            else:
                break
        except Exception:
            break
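
    One thing to note about the loop above: page.locator() always returns a Locator object, which is truthy even when nothing matches, so the else branch never runs; the loop actually ends when click() times out once the button is gone and the exception is caught. If you prefer an explicit check, here is a minimal alternative sketch (it assumes the button is removed from the DOM once all products are loaded):

    show_more_button = page.locator("button.primary.amscroll-load-button-new")
    while await show_more_button.count() > 0:  # count() drops to 0 once the button is gone
        await show_more_button.click()
        await page.wait_for_load_state("domcontentloaded", timeout=5000)
        await self.scroll_to_bottom(page)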
    
    

    Here is the full code:

    import datetime
    import scrapy
    
    from scrapy.crawler import CrawlerProcess
    from scrapy_playwright.page import PageMethod
    from scrapy.selector import Selector
    
    class LidlSpider(scrapy.Spider):
        name = 'lidl_snacks'
        allowed_domains = ['sortiment.lidl.ch']
        custom_settings = {
            'ROBOTSTXT_OBEY': False
        }
        start_urls = [
            'https://sortiment.lidl.ch/de/kaffee-tee', #72 products
        ] 
        
        def start_requests(self):
            for url in self.start_urls:
                yield scrapy.Request(
                    url,
                    dont_filter=True,
                    callback=self.parse,
                    meta={
                        'url': url,
                        'playwright': True,
                        'playwright_include_page': True,
                        'playwright_page_methods':[
                            PageMethod('wait_for_load_state',"domcontentloaded"),   
                        ]
                    }
                )
        async def scroll_to_bottom(self,page):
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
    
        async def parse(self, response):
            page = response.meta["playwright_page"]
            
            #await page.screenshot(path="popup.png")
    
            popup = 'div#onetrust-banner-sdk'
            if await page.is_visible(popup, timeout = 5000):
                await page.locator('button#onetrust-accept-btn-handler').click()
                await page.wait_for_selector(popup, state='hidden')
    
            #await page.screenshot(path="popup_clicked_check.png", full_page=True)
            
            #count = 0
            while True: 
                try:
                    show_more_button = page.locator("button.primary.amscroll-load-button-new")
                    if show_more_button:
                        await show_more_button.click()
                        await page.wait_for_load_state("domcontentloaded", timeout=5000)  # Wait for new page to load
                        await self.scroll_to_bottom(page)  # Optional scroll down on the new page
                        # await page.screenshot(path=f"page_scrolled_{count}.png", full_page=True)
                        # count+=1
                    else:
                        break
                except Exception:
                    break
       
            #Extract product information after pagination click
            content = await page.content()
            sel = Selector(text=content)
            produkte = sel.css('div.product-item-info')
            for produkt in produkte:
                produkt_url = produkt.css('a.product-item-link::attr(href)').get()
                yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})
    
          
    
        def parse_produkt(self, response):
            
            mini_dict = {
                    'retailer':       self.name,
                    'datetime':       datetime.date.today(),
                    'categorie':      None,
                    'id':             None, #response.css('div.col-left>p::text').get().split()[1],
                    'brand':          str(response.css('p.brand-name::text').get()),
                    'detail':         str(response.css('span.base::text').get()),
                    'actual_price':   response.css('strong.pricefield__price::attr(content)').get(),
                    'quantity':       None,
                    'regular_price':  None,
                    'price_per_unit': None,
    
                }
               
    
            yield mini_dict
    
    if __name__ == "__main__":  # __main__ was only created for debug purposes
        process = CrawlerProcess()
        process.crawl(LidlSpider)
        process.start()
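
    One more thing worth checking: the playwright meta keys only take effect when the scrapy-playwright download handler and the asyncio reactor are enabled. Those settings normally live in a project's settings.py, and a bare CrawlerProcess() in the __main__ block does not load them, so (assuming they are not configured elsewhere in your setup) you can add them to custom_settings:

    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        # both of the following are required by scrapy-playwright
        'DOWNLOAD_HANDLERS': {
            'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
            'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
        },
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
    }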
    
    

    Note(s):

    • Replaced the /sussigkeiten-snacks#/ URL with /kaffee-tee, as that page has fewer products to scrape.
    • Returns the correct number of items when run as is, or you can use
    scrapy crawl lidl_snacks -O snacks.json
    

    to see what it returns.
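
    If you run the spider via the __main__ block instead of the scrapy CLI, the equivalent of -O snacks.json is the FEEDS setting passed to CrawlerProcess (the file name here is only an example):

    process = CrawlerProcess(settings={
        'FEEDS': {
            'snacks.json': {'format': 'json', 'overwrite': True},  # same effect as -O snacks.json
        },
    })
    process.crawl(LidlSpider)
    process.start()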