I'm new to Scrapy and I'm trying to scrape https://www.opensports.com.ar. I need some data from all products, so the idea is to get all brands (if I get all brands, I'll get all products). Each brand URL has a number of pages (24 articles per page), so I need to work out the total number of pages for each brand and then request the pages from 1 to that total. I'm facing a problem (or more!) with the hrefs... This is the script:
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import datetime

#start_url: https://www.opensports.com.ar/marcas.html
class SolodeportesSpider(scrapy.Spider):
    name = 'solodeportes'
    start_urls = ['https://www.opensports.com.ar/marcas.html']
    custom_settings = {
        'FEED_URI': 'opensports_' + f'{datetime.datetime.today().strftime("%d-%m-%Y-%H%M%S")}.csv',
        'FEED_FORMAT': 'csv',
    }

    #get links of dif. brands
    def parse(self, response):
        marcas = response.css('#maincontent > div.category-view > div > div.brands-page > table > tbody td a::attr(href)').getall()
        for marca in marcas:
            yield Request(marca, self.parse_paginator)

    #get total number of pages of the brand and request all pages from 1 to total number of products
    def parse_paginator(self, response):
        total_products = int(int(response.css('#toolbar-amount > span:nth-child(3)::text').get() / 24) + 1)
        for count in range(1, total_products):
            yield Request(url=f'https://www.opensports.com.ar/{response.url}?p={count}',
                          callback=self.parse_listings)

    #Links list to click to get the articles detail
    def parse_listings(self, response):
        all_listings = response.css('a.product-item-link::attr(class)').getall()
        for url in all_listings:
            yield Request(url, self.detail_page)

    #url--Article-- Needed data
    def detail_page(self, response):
        yield {
            'Nombre_Articulo': response.css('h1.page-title span::text').get(),
            'Precio_Articulo': response.css('span.price::text').get(),
            'Sku_Articulo': response.css('td[data-th="SKU"]::text').get(),
            'Tipo_Producto': response.css('td[data-th="Disciplina"]::text').get(),
            'Item_url': response.url
        }


process = CrawlerProcess()
process.crawl(SolodeportesSpider)
process.start()
And I'm getting this error message:
c:/Users/User/Desktop/Personal/DABRA/Scraper_opensports/opensports/opens_sp_copia_solod.py
2022-01-16 03:45:05 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2022-01-16 03:45:05 [scrapy.utils.log] INFO: Versions: lxml 4.7.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.10.1 (tags/v3.10.1:2cd268a, Dec 6 2021, 19:10:37) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1m 14 Dec 2021), cryptography 36.0.1, Platform Windows-10-10.0.19042-SP0
2022-01-16 03:45:05 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-16 03:45:05 [scrapy.crawler] INFO: Overridden settings: {}
2022-01-16 03:45:05 [scrapy.extensions.telnet] INFO: Telnet Password: b362a63ff2281937
2022-01-16 03:45:05 [py.warnings] WARNING: C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\extensions\feedexport.py:247: ScrapyDeprecationWarning: The FEED_URI and FEED_FORMAT settings have been deprecated in favor of the FEEDS setting. Please see the FEEDS setting docs for more details
  exporter = cls(crawler)
2022-01-16 03:45:05 [scrapy.middleware] INFO: Enabled extensions: ['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.telnet.TelnetConsole', 'scrapy.extensions.feedexport.FeedExporter', 'scrapy.extensions.logstats.LogStats']
2022-01-16 03:45:05 [scrapy.middleware] INFO: Enabled downloader middlewares: ['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', 'scrapy.downloadermiddlewares.retry.RetryMiddleware', 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware', 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-01-16 03:45:05 [scrapy.middleware] INFO: Enabled spider middlewares: ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', 'scrapy.spidermiddlewares.referer.RefererMiddleware', 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-01-16 03:45:05 [scrapy.middleware] INFO: Enabled item pipelines: []
2022-01-16 03:45:05 [scrapy.core.engine] INFO: Spider opened
2022-01-16 03:45:05 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-16 03:45:05 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-16 03:45:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.opensports.com.ar/marcas.html> (referer: None)
2022-01-16 03:45:07 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.opensports.com.ar/marcas.html> (referer: None)
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\utils\defer.py", line 120, in iter_errback
    yield next(it)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\utils\python.py", line 353, in __next__
    return next(self.data)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\core\spidermw.py", line 56, in _evaluate_iterable
    for r in iterable:
  File "c:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\opensports\opens_sp_copia_solod.py", line 16, in parse
    yield Request(marca, self.parse_paginator)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
    self._set_url(url)
  File "C:\Users\User\Desktop\Personal\DABRA\Scraper_opensports\venv\lib\site-packages\scrapy\http\request\__init__.py", line 73, in _set_url
    raise ValueError(f'Missing scheme in request url: {self._url}')
ValueError: Missing scheme in request url: /marca/adidas.html
2022-01-16 03:45:07 [scrapy.core.engine] INFO: Closing spider (finished)
2022-01-16 03:45:07 [scrapy.statscollectors] INFO: Dumping Scrapy stats: {'downloader/request_bytes': 232, 'downloader/request_count': 1, 'downloader/request_method_count/GET': 1, 'downloader/response_bytes': 22711, 'downloader/response_count': 1, 'downloader/response_status_count/200': 1, 'elapsed_time_seconds': 1.748282, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2022, 1, 16, 6, 45, 7, 151772), 'httpcompression/response_bytes': 116063, 'httpcompression/response_count': 1, 'log_count/DEBUG': 1, 'log_count/ERROR': 1, 'log_count/INFO': 10, 'log_count/WARNING': 1, 'response_received_count': 1, 'scheduler/dequeued': 1, 'scheduler/dequeued/memory': 1, 'scheduler/enqueued': 1, 'scheduler/enqueued/memory': 1, 'spider_exceptions/ValueError': 1, 'start_time': datetime.datetime(2022, 1, 16, 6, 45, 5, 403490)}
First, I have a problem with the f-string URL... I don't know how to concatenate the URL, because from:
marcas= response.css('#maincontent > div.category-view > div > div.brands-page > table > tbody td a::attr(href)').getall()
I get this type of URL (I don't know if it's OK as-is or if I need the https:// part):
'/marca/adidas.html'
I know that it's wrong and I couldn't find a way to fix it... Could anyone give me a hand?
Thanks in advance!
For the relative URLs you can use response.follow, or with a plain Request just add the base URL.
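For example, either of these works inside your parse method (a minimal sketch reusing your own selector; pick one option, don't yield both for the same href):

    def parse(self, response):
        marcas = response.css('#maincontent > div.category-view > div > div.brands-page > table > tbody td a::attr(href)').getall()
        for marca in marcas:
            # Option 1: response.follow() resolves relative hrefs like '/marca/adidas.html' against the page URL
            yield response.follow(marca, callback=self.parse_paginator)
            # Option 2: build the absolute URL yourself and pass it to a plain Request
            # yield Request(response.urljoin(marca), callback=self.parse_paginator)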
Some other errors you have:
In parse_listings you're selecting the class attribute instead of href. I've fixed errors #1 and #2; you need to figure out how to fix error #3.
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import datetime

#start_url: https://www.opensports.com.ar/marcas.html
class SolodeportesSpider(scrapy.Spider):
    name = 'solodeportes'
    start_urls = ['https://www.opensports.com.ar/marcas.html']
    custom_settings = {
        'FEED_URI': 'opensports_' + f'{datetime.datetime.today().strftime("%d-%m-%Y-%H%M%S")}.csv',
        'FEED_FORMAT': 'csv',
    }

    #get links of dif. brands
    def parse(self, response):
        marcas = response.css('#maincontent > div.category-view > div > div.brands-page > table > tbody td a::attr(href)').getall()
        for marca in marcas:
            yield response.follow(url=marca, callback=self.parse_paginator)

    #get total number of pages of the brand and request all pages from 1 to total number of products
    def parse_paginator(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_listings, dont_filter=True)

        next_page = response.xpath('//a[contains(@class, "next")]/@href').get()
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse_paginator)

    #Links list to click to get the articles detail
    def parse_listings(self, response):
        all_listings = response.css('a.product-item-link::attr(href)').getall()
        for url in all_listings:
            yield Request(url, self.detail_page)

    #url--Article-- Needed data
    def detail_page(self, response):
        yield {
            'Nombre_Articulo': response.css('h1.page-title span::text').get(),
            'Precio_Articulo': response.css('span.price::text').get(),
            'Sku_Articulo': response.css('td[data-th="SKU"]::text').get(),
            'Tipo_Producto': response.css('td[data-th="Disciplina"]::text').get(),
            'Item_url': response.url
        }
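Note that this version of parse_paginator just follows the "next" link on each listing page, so you no longer need to compute the total number of pages from the toolbar text.

As an aside, the ScrapyDeprecationWarning in your log is because FEED_URI and FEED_FORMAT were replaced by the FEEDS setting. Something like this (a sketch, keeping your file-name pattern) exports the same CSV without the warning:

    custom_settings = {
        'FEEDS': {
            f'opensports_{datetime.datetime.today().strftime("%d-%m-%Y-%H%M%S")}.csv': {
                'format': 'csv',
            },
        },
    }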