Search code examples
python, scrapy, scrapy-splash

Scrapy Splash Dynamic scraping with CrawlSpider


I am trying to scrape data from a React-based website, but when I use CrawlSpider the followed pages are not rendered. For example, my first URL is fetched through Splash and parses correctly, but all the other URLs are fetched as regular requests, without the dynamic content.

This is my code:

class PageSpider(CrawlSpider):
    """Crawl ``hooshmandsazeh.com`` and fetch every page through Splash.

    Both the start URL and every link extracted by the crawl rule are turned
    into ``SplashRequest`` objects, so followed pages get the same JavaScript
    rendering as the first one.

    Requests are sent with ``dont_process_response=True``. The original page
    URL is therefore stored in ``meta['real_url']`` and re-applied to the
    response before links are extracted or items are built (presumably
    because the response otherwise carries the Splash endpoint URL -- verify
    against the scrapy-splash documentation).
    """

    host = 'hooshmandsazeh.com'
    protocol = 'https'
    root_domain = 'hooshmandsazeh.com'
    name = 'page'
    allowed_domains = [host]

    # Seconds Splash waits after page load before returning the HTML.
    splash_wait = 1

    custom_settings = {
        #'DEPTH_LIMIT': 9,
    }

    # NOTE(review): Scrapy's CrawlSpider documentation advises against using
    # ``parse`` as a rule callback; the name is kept so the class interface
    # stays unchanged.
    rules = (
        Rule(
            LinkExtractor(
                allow=(host,),
                # Raw strings: '\.' in a normal literal is an invalid escape.
                deny=(r'\.webp', r'\.js', r'\.css', r'\.jpg', r'\.png'),
                unique=True,
            ),
            callback='parse',
            follow=True,
            process_request='splash_request',
        ),
    )

    def start_requests(self):
        """Yield the Splash-rendered request for the site root."""
        yield self._splash_request(f'{self.protocol}://{self.host}')

    def _splash_request(self, url, callback=None, errback=None, meta=None):
        """Build a ``SplashRequest`` for *url* that remembers its real URL.

        *meta* is copied (not mutated) and extended with ``real_url`` so the
        callbacks can restore the page URL on the response.
        """
        request_meta = dict(meta or {})
        request_meta['real_url'] = url
        return SplashRequest(
            url,
            callback=callback,
            errback=errback,
            dont_process_response=True,
            args={'wait': self.splash_wait},
            meta=request_meta,
        )

    def _with_real_url(self, response):
        """Return *response* carrying ``meta['real_url']`` as its URL.

        The response is returned unchanged when no real URL was recorded or
        when it already matches, so ``replace`` is never called with ``None``.
        """
        real_url = response.meta.get('real_url')
        if not real_url or real_url == response.url:
            return response
        return response.replace(url=real_url)

    def splash_request(self, request, response=None):
        """Convert a request extracted by a rule into a ``SplashRequest``.

        The callback, errback and meta set by ``_build_request`` are carried
        over so the response is still routed through the rule machinery.
        *response* is optional and unused; it lets the hook be called with
        either one or two arguments.
        """
        self.logger.debug('Routing %s through Splash', request.url)
        return self._splash_request(
            request.url,
            callback=request.callback,
            errback=request.errback,
            meta=request.meta,
        )

    def _requests_to_follow(self, response):
        """Yield follow-up requests for links found in *response*.

        Same flow as the stock implementation, except that links are extracted
        from a copy of the response whose URL is the real page URL.
        """
        if not isinstance(response, HtmlResponse):
            return
        page = self._with_real_url(response)
        seen = set()
        for index, rule in enumerate(self._rules):
            links = [
                link
                for link in rule.link_extractor.extract_links(page)
                if link not in seen
            ]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                yield rule.process_request(self._build_request(index, link))

    def parse(self, response):
        """Emit a ``PageLevelItem`` for pages that contain off-host links.

        The item records the real page URL, not the URL of the response as
        delivered.
        """
        page = self._with_real_url(response)
        if LinkExtractor(deny=self.host).extract_links(page):
            loader = ItemLoader(item=PageLevelItem(), response=page)
            loader.add_value('page_source_url', page.url)
            yield loader.load_item()

Solution

  • The code below worked for me:

    def splash_request(self, request):
        """Rebuild a rule-extracted request as a ``SplashRequest``.

        The original callback, errback and meta are carried over so the
        response is still handled by the rule that produced the request.
        ``meta['real_url']`` records the page URL for later use.
        """
        # Copy so the incoming request's meta dict is not mutated.
        meta = dict(request.meta)
        meta['real_url'] = request.url
        return SplashRequest(
            request.url,
            callback=request.callback,
            errback=request.errback,
            dont_process_response=True,
            args={'wait': 0},
            meta=meta,
        )