Search code examples
python python-requests scrapy pagination header

How to solve 'Spider_error_processing_headers' problem while using Scrapy's CrawlSpider and LinkExtractor?


I got a message like `ERROR: Spider error processing`, with a traceback ending in `line 276, in aiter_errback: yield await it.__anext__()`, in my terminal. My code is given below. Can anyone tell me where the problem is?

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CandywareCrawlspiderSpider(CrawlSpider):
    """Crawl the paginated "wedding" collection on candywarehouse.com.

    The crawl starts from one listing page, follows the pagination
    "Next »" link via a single rule, and yields one dict per product
    container found on each page handled by ``parse_item``.
    """

    name = "candyware_crawlspider"
    allowed_domains = ["www.candywarehouse.com"]

    # No ``start_urls``: the first request is built in ``start_requests`` so
    # that it can carry the custom User-Agent header.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'

    # Follow only the "Next »" pagination link; every request created from it
    # is passed through ``set_user_agent`` and its response is handled by
    # ``parse_item``.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=('//ul[@class="pagination-custom"]//li/a[@title="Next »"]')),
            callback='parse_item',
            follow=True,
            process_request='set_user_agent',
        ),
    )

    def start_requests(self):
        """Yield the initial listing-page request with the custom User-Agent."""
        # NOTE(review): no callback is set on this request; ``parse_item`` is
        # attached only through the rule above, so confirm whether products on
        # the start page itself are expected in the output.
        yield scrapy.Request(
            url='https://www.candywarehouse.com/collections/wedding?page=24',
            headers={'user-agent': self.user_agent},
        )

    def set_user_agent(self, request, spider):
        """Set the User-Agent header on a rule-extracted request.

        NOTE(review): Scrapy's ``Rule`` documentation describes the second
        argument of ``process_request`` as the response the request
        originated from, not a spider. The parameter name is kept as in the
        original code and the value is unused here.
        """
        request.headers['User-Agent'] = self.user_agent
        return request

    @staticmethod
    def _first_text(node, query):
        """Return the first match of ``query`` under ``node``, or ``''``.

        Using ``default=''`` guarantees a ``str`` result, so callers can
        chain string methods even when the XPath matches nothing.
        """
        return node.xpath(query).get(default='')

    def parse_item(self, response):
        """Yield a dict with name, price and review count for each product.

        Fields whose XPath matches nothing are emitted as empty strings
        instead of raising ``AttributeError`` on ``None``.
        """
        # The User-Agent is the same for every product on the page.
        sent_user_agent = response.request.headers['User-Agent']

        for product in response.xpath('//div[@class="js-grid"]/div'):
            product_name = self._first_text(
                product, './/p[@class="product__grid__title"]/text()'
            ).strip()
            price = self._first_text(
                product, './/span[@class="price"]/text()'
            ).strip()
            review_counts = self._first_text(
                product, './/span[@class="tt-product-block__rating"]/text()'
            ).replace('\n', '').replace('   ', '')

            yield {
                'product_name': product_name,
                'price': price,
                'review_counts': review_counts,
                'User-Agent': sent_user_agent,
            }


Solution

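  • If you call string methods such as strip() or replace() on the result of get(), you need to make sure you actually get a string. get() returns None when the selector matches nothing, which raises an AttributeError; passing default='' avoids that.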

    # get(default='') returns an empty string instead of None when the XPath
    # matches nothing, so the chained string methods always run on a str.
    product_name = product.xpath('.//p[@class="product__grid__title"]/text()').get(default='').strip()
    price = product.xpath('.//span[@class="price"]/text()').get(default='').strip()
    review_counts = product.xpath('.//span[@class="tt-product-block__rating"]/text()').get(default='').replace('\n', '').replace('   ', '')