Search code examples
python error-handling scrapy scrapy-splash index-error

Scraper not getting total data


I have a .py scraper, and when it runs it works fine, but it does not get 100% of the data. I'm getting a lot of errors like this:

2022-05-05 20:53:39 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.justforsport.com.ar/buzo-hombre-361-degrees-y2201my002a-urban-1-gris/p> (referer: https://www.justforsport.com.ar/hombre?page=3)
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
    yield next(it)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "c:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\just_for_sport\just_for_sport\spiders\jfs_hombre.py", line 41, in parse_article_detail
    precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_jfs\venv\lib\site-packages\parsel\selector.py", line 70, in __getitem__
    o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range

this is my script:

import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os

# Delete jfs_hombre.csv if it already exists, and report which case applied.
if not os.path.exists('jfs_hombre.csv'):
    print("The file does not exist!")
else:
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")

class JfsSpider_hombre(scrapy.Spider):
    """Crawl the "hombre" category of justforsport.com.ar, one item per product.

    Flow: ``parse`` reads the total product count from the first listing page
    and schedules every listing page; ``parse_links`` follows each product
    link found there; ``parse_article_detail`` extracts the item fields.
    """

    name = 'jfs_hombre'
    start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    # Page size used to turn the product count into a page count.
    # NOTE(review): 27 is the divisor the original code used; confirm it
    # matches the number of products the site shows per listing page.
    products_per_page = 27

    def parse(self, response):
        """Schedule one Splash request per listing page of the category.

        Yields nothing (and logs an error) when the product counter cannot be
        found, because the number of pages is then unknown.
        """
        total_text = response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get()
        if total_text is None:
            self.logger.error('Total product count not found on %s', response.url)
            return

        # Ceiling division: a partially filled last page is still a page.
        # A floor-based page count never requests that last page, so its
        # products are silently missing from the output.
        total_pages = -(-int(total_text) // self.products_per_page)
        for count in range(1, total_pages + 1):
            yield SplashRequest(url=f'https://www.justforsport.com.ar/hombre?page={count}',
                                callback=self.parse_links)

    def parse_links(self, response):
        """Follow every product link found on a listing page."""
        links = response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
        for link in links:
            yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail)

    def parse_article_detail(self, response):
        """Yield the scraped fields of one product page.

        'precio' is None when the page contains no price container; 'Sku' and
        'Name' are None when their element is absent.
        """
        precio0 = response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')
        if precio0:
            # Only the first price container is used; its integer parts are
            # concatenated into a single string.
            precio = ''.join(precio0[0].css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall())
        else:
            # Indexing an empty SelectorList raises IndexError, which aborts
            # the callback and loses the whole item. Keep the item and record
            # which page had no price instead.
            # NOTE(review): an empty match may mean the page was returned
            # before the price was rendered; if so, passing a wait time to
            # SplashRequest (args={'wait': ...}) is worth trying - confirm.
            self.logger.warning('No price element found on %s', response.url)
            precio = None

        yield {
            'Casa': 'Just_For_Sports',
            'Sku': response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
            'Name': response.css('span.vtex-store-components-3-x-productBrand::text').get(),
            'precio': precio,
            'Link': response.url,
            'Date': datetime.today().strftime('%Y-%m-%d'),
        }

# Configure an in-process crawl that exports the scraped items to jfs_hombre.csv.
# NOTE(review): no SPLASH_URL or scrapy-splash middleware settings appear in
# this dict; confirm they are supplied elsewhere, otherwise the SplashRequest
# objects are presumably not routed through a Splash instance.
process = CrawlerProcess(
    settings={
        # FEEDS replaces the FEED_URI / FEED_FORMAT pair, which is deprecated
        # in Scrapy 2.x and emits a deprecation warning.
        'FEEDS': {
            'jfs_hombre.csv': {'format': 'csv'},
        },
        'FEED_EXPORT_ENCODING': 'utf-8',
        'CONCURRENT_REQUESTS': 16,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 2,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    }
)

process.crawl(JfsSpider_hombre)
process.start()

I don't understand what the error is about. Why do I sometimes get 100% of the info and sometimes get these messages? Is it something related to the script, the user agent, or the moment when the process runs?

Thanks in advance!


Solution

  • The data is also available from the site's API, which returns a JSON response to a GET request, so you can grab whatever data points you want in the easiest and fastest way. Below is an example of a working solution.

    import scrapy
    from scrapy.crawler import CrawlerProcess
    
    class JfsSpider_hombre(scrapy.Spider):
        """Collect product names of the "hombre" category from the store's search API.

        One GET request is issued per window of ``page_size`` products. The
        window is carried in the ``from`` / ``to`` fields of the base64-encoded
        ``variables`` payload inside the ``extensions`` query parameter.

        The single hard-coded URL this replaces always asked for products
        64-95 and then yielded that same response 18 times, so the output
        contained 576 rows but only 32 distinct products.
        """

        name = 'jfs_hombre'

        # Endpoint and fixed query-string parameters, copied from the original
        # request URL.
        api_url = 'https://www.justforsport.com.ar/_v/segment/graphql/v1'
        api_params = {
            'workspace': 'master',
            'maxAge': 'short',
            'appsEtag': 'remove',
            'domain': 'store',
            'locale': 'es-AR',
            '__bindingId': 'e841e6ce-1216-4569-a2ad-0188ba5a92fc',
            'operationName': 'productSearchV3',
            'variables': '{}',
        }

        # "persistedQuery" object of the extensions parameter, copied verbatim.
        persisted_query = {
            'version': 1,
            'sha256Hash': '6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5',
            'sender': 'vtex.store-resources@0.x',
            'provider': 'vtex.search-graphql@0.x',
        }

        # Decoded form of the base64 "variables" payload of the original URL.
        # 'from' and 'to' are overwritten for every request.
        search_variables = {
            'hideUnavailableItems': False,
            'skusFilter': 'FIRST_AVAILABLE',
            'simulationBehavior': 'default',
            'installmentCriteria': 'MAX_WITHOUT_INTEREST',
            'productOriginVtex': False,
            'map': 'c',
            'query': 'hombre',
            'orderBy': 'OrderByReleaseDateDESC',
            'from': 64,
            'to': 95,
            'selectedFacets': [{'key': 'c', 'value': 'hombre'}],
            'operator': 'and',
            'fuzzy': '0',
            'searchState': None,
            'facetsBehavior': 'Static',
            'categoryTreeBehavior': 'default',
            'withFacets': False,
        }

        # Upper bound and step of the product offsets, as hard-coded in the
        # original loop.
        # NOTE(review): 576 is presumably the category size at the time the
        # snippet was written - confirm, or derive it from the API response.
        total_products = 576
        page_size = 32

        def start_requests(self):
            """Yield one API request per window of ``page_size`` products."""
            # Local imports keep this snippet self-contained.
            import base64
            import json
            from urllib.parse import urlencode

            for offset in range(0, self.total_products, self.page_size):
                variables = dict(self.search_variables)
                variables['from'] = offset
                variables['to'] = offset + self.page_size - 1  # inclusive bound

                encoded_variables = base64.b64encode(
                    json.dumps(variables, separators=(',', ':')).encode('utf-8')
                ).decode('ascii')
                extensions = json.dumps(
                    {'persistedQuery': self.persisted_query,
                     'variables': encoded_variables},
                    separators=(',', ':'),
                )
                query = urlencode({**self.api_params, 'extensions': extensions})

                yield scrapy.Request(
                    url=f'{self.api_url}?{query}',
                    callback=self.parse,
                    method="GET"
                )

        def parse(self, response):
            """Yield the name of every product in one API response, once each."""
            resp = response.json()
            for result in resp['data']['productSearch']['products']:
                yield {
                    'productName': result['productName']
                }
    if __name__ == "__main__":
        # Run the spider in-process when the file is executed as a script.
        process = CrawlerProcess()
        # crawl() must be given the spider class; calling it with no argument
        # raises TypeError before any request is made.
        process.crawl(JfsSpider_hombre)
        process.start()
    

    Output:

    'downloader/response_status_count/200': 1,
     'item_scraped_count': 576,