python scrapy playwright scrapy-playwright

Scrapy-playwright with multiple start_urls

Similar problem was discussed here but I was not able to make my code work. The aim is to scrapy-playwright generate request-response for each URL in start_urls and parse each response the same way. CSV with urls is correctly read to a list but requests are not generated by start_requests. See the commented code below.

import scrapy
import asyncio
from scrapy_playwright.page import PageMethod

class MySpider(scrapy.Spider):
    name = "Forum01"
    allowed_domains = ["example.com"]

    def start_requests(self):
        with open('FullLink.csv') as file:
            start_urls = [line.strip() for line in file]
        print(start_urls) # When Scrapy crawl the list of URLs is correctly printed
        
        for u in self.start_urls:    
            yield scrapy.Request(
                u,
                meta=dict(
                    playwright=True,
                    playwright_include_page=False,
                    playwright_page_methods=[
                        PageMethod("wait_for_selector", "div.modal-body > p")
                    ], # End of methods
                ), # End of meta
                callback=self.parse
            )

    async def parse(self, response): # Does not work either with sync or async
        for item in response.css('div.modal-content'):
            yield{
                'title': item.css('h1::text').get(),
                'info': item.css('.row+ p::text').get(),
            }

Do you have an idea how to correctly feed the URLs to the spider? Thank you!

Solution

You are trying to iterate an empty sequence in your for loop instead of the one extracted from the csv file.

Unless explicitly overwritten self.start_urls will always refer to an empty list that is created in the scrapy.Spider constructor. Removing the self part of self.start_urls should solve your problem.

import scrapy
import asyncio
from scrapy_playwright.page import PageMethod

class MySpider(scrapy.Spider):
    name = "Forum01"
    allowed_domains = ["example.com"]

    def start_requests(self):
        with open('FullLink.csv') as file:
            start_urls = [line.strip() for line in file] 
        print(start_urls) # When Scrapy crawl the list of URLs is correctly printed
        
        for u in self.start_urls: # <- change self.start_urls to just start_urls
            yield scrapy.Request(  #-----------------------------------
                u,
                meta=dict(
                    playwright=True,
                    playwright_include_page=False,
                    playwright_page_methods=[
                        PageMethod("wait_for_selector", "div.modal-body > p")
                    ], # End of methods
                ), # End of meta
                callback=self.parse
            )

    async def parse(self, response): # Does not work either with sync or async
        for item in response.css('div.modal-content'):
            yield{
                'title': item.css('h1::text').get(),
                'info': item.css('.row+ p::text').get(),
            }