I'm running into some difficulties with a Scrapy spider.
The parse() function is not working as it should. It receives the response for a search URL built from a keyword and then, for each listing on the page, follows the listing's URL to fill a Scrapy Data item.
It has a second yield which recursively calls parse() with the next_page URL until max_pages is reached, so that the listings on the following pages are grabbed as well.
The second yield isn't producing any output in the output.json file when I run scrapy crawl example -o output.json
Here is a reduced, runnable version of the spider code which reproduces the problem when added to a Scrapy project.
import scrapy
class Data(scrapy.Item):
    """Item describing one scraped listing."""

    # Page value carried along with the listing request.
    page: int = scrapy.Field()
    # URL of the listing page itself.
    url: str = scrapy.Field()
    # Text taken from the listing's description box.
    description: str = scrapy.Field()
    # User name read from the profile card.
    user: str = scrapy.Field()
    # Image URLs captured from the slide elements' style attributes.
    images: list = scrapy.Field()
class Example(scrapy.Spider):
    """Keyword-search spider for several coinafrique.com subdomains.

    A search request is issued for every (site, keyword) pair. ``parse``
    follows each listing found on a result page and also requests the next
    result page; ``followListing`` turns a listing page into a ``Data`` item.
    """

    name = 'example'
    search = '/search?category=&keyword='
    keywords = ['terrains', 'maison', 'land']
    max_pages = 2
    current_page = 1

    def gen_requests(self, url):
        """Yield one search request per configured keyword for the site ``url``."""
        for keyword in self.keywords:
            # The words of a multi-word keyword are glued together with '+'.
            query = '+'.join(keyword.split(' '))
            yield scrapy.Request(
                url + self.search + query,
                meta={'main_url': url, 'current_page': 1},
                callback=self.parse,
            )

    def start_requests(self):
        """Yield the initial search requests for every site."""
        sites = [
            'https://ci.coinafrique.com',
            'https://sn.coinafrique.com',
            'https://bj.coinafrique.com',
        ]
        for site in sites:
            yield from self.gen_requests(site)

    def parse(self, response):
        """Follow every listing on a result page, then request the next page.

        Reads ``current_page`` and ``main_url`` from ``response.meta``.
        """
        page_no = response.meta['current_page']
        site = response.meta['main_url']

        for card in response.css('div.col.s6.m4'):
            link = card.xpath('.//p[@class="ad__card-description"]/a/@href').get()
            yield scrapy.Request(
                response.urljoin(link),
                meta={'current_page': page_no},
                callback=self.followListing,
            )

        try:
            # The second "direction" pagination link is used as the next page.
            direction_links = response.css('li.pagination-indicator.direction a::attr(href)')
            next_href = direction_links[1].get()
            if next_href is not None and page_no < self.max_pages:
                next_url = site + '/search' + next_href
                page_no += 1
                # NOTE: the request below carries the literal page value 1,
                # not the counter incremented just above.
                yield scrapy.Request(
                    next_url,
                    meta={'main_url': site, 'current_page': 1},
                    callback=self.parse,
                )
        except:
            # Bare except: any error raised in the block above ends up here.
            print('No next page found')

    def followListing(self, response):
        """Yield a ``Data`` item built from one listing page."""
        url = response.url
        page_no = response.meta['current_page']
        # The second text node of the description box is used as description.
        description = response.xpath(
            '//div[@class="ad__info__box ad__info__box-descriptions"]//text()'
        ).getall()[1]
        profile_card = response.css('div.profile-card__content')
        user = profile_card.xpath('.//p[@class="username"]//text()').get()
        slide_styles = response.xpath('//div[contains(@class,"slide-clickable")]/@style')
        images = list(slide_styles.re(r'url\((.*)\)'))
        yield Data(
            page=page_no,
            url=url,
            description=description,
            user=user,
            images=images,
        )
If I swap the two yields in the parse() function, it returns only the listings of the last page (max_pages, e.g. page 2); it seems that only the results from the first yield are returned in both cases.
def parse(self, response):
    """Request the next result page first, then follow every listing.

    Reads ``current_page`` and ``main_url`` from ``response.meta``.
    """
    page_no = response.meta['current_page']
    site = response.meta['main_url']

    try:
        # The second "direction" pagination link is used as the next page.
        direction_links = response.css('li.pagination-indicator.direction a::attr(href)')
        next_href = direction_links[1].get()
        if next_href is not None and page_no < self.max_pages:
            next_url = site + '/search' + next_href
            # The counter is bumped before the listing loop below, so the
            # listing requests carry the incremented value ...
            page_no += 1
            # ... while the next-page request itself carries the literal 1.
            yield scrapy.Request(
                next_url,
                meta={'main_url': site, 'current_page': 1},
                callback=self.parse,
            )
    except:
        # Bare except: any error raised in the block above ends up here.
        print('No next page found')

    for card in response.css('div.col.s6.m4'):
        link = card.xpath('.//p[@class="ad__card-description"]/a/@href').get()
        yield scrapy.Request(
            response.urljoin(link),
            meta={'current_page': page_no},
            callback=self.followListing,
        )
Instead of using the request's meta dictionary to pass variables between callback methods, Scrapy provides the cb_kwargs
parameter for exactly that purpose. However, in this instance neither is actually necessary.
The reason it's not working is that something about how you construct the URL for the next page is failing. So instead of using the main_url and the current_page variables, you can get the current page from the pagination elements at the bottom of the page by looking for the page link that has active as its class name, and then getting that element's sibling to find the next page. Then you can turn the relative link into an absolute URL with response.urljoin.
For example:
def parse(self, response):
    """Follow every listing on a result page, then request the next page.

    The current page is read from the pagination element marked ``active``
    and forwarded to ``followListing`` through ``cb_kwargs``. The next page
    link is taken from that element's following sibling.

    Args:
        response: Response for a search result page.

    Yields:
        One request per listing link, followed by a request for the next
        result page when a next-page link exists.
    """
    current_page = response.xpath('//li/span[@class="active"]')
    current_text = current_page.xpath('.//text()').get()

    for listing in response.css('div.col.s6.m4'):
        href = listing.xpath('.//p[@class="ad__card-description"]/a/@href').get()
        if href is None:
            # Cards without a description link have nothing to follow.
            continue
        yield scrapy.Request(
            response.urljoin(href),
            callback=self.followListing,
            cb_kwargs={"current_page": current_text},
        )

    next_page = current_page.xpath('./following-sibling::span/a/@href').get()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
The followListing method can then receive the current page as a keyword argument, passed to it through cb_kwargs:
def followListing(self, response, current_page):
    """Build a ``Data`` item from a single listing page.

    Args:
        response: Response for the listing page.
        current_page: Page value supplied as a keyword argument by the
            requesting callback; stored on the item unchanged.

    Yields:
        One ``Data`` item holding the listing URL, page, description,
        user name and image URLs.
    """
    texts = response.xpath(
        '//div[@class="ad__info__box ad__info__box-descriptions"]//text()'
    ).getall()
    # The description is the second text node. A plain truthiness check is
    # not enough: a single matching node would still raise IndexError.
    description = texts[1] if len(texts) > 1 else ""
    profile = response.css('div.profile-card__content')
    user = profile.xpath('.//p[@class="username"]//text()').get()
    # .re() already returns the captured URLs as a list of strings.
    images = response.xpath(
        '//div[contains(@class,"slide-clickable")]/@style'
    ).re(r'url\((.*)\)')
    yield Data(
        url=response.url,
        current_page=current_page,
        description=description,
        user=user,
        images=images,
    )
So in total your spider would look like this:
import scrapy
class Data(scrapy.Item):
    """Item describing one scraped listing."""

    page: int = scrapy.Field()
    # URL of the listing page itself.
    url: str = scrapy.Field()
    # Page value handed to the listing callback by the result-page callback.
    current_page = scrapy.Field()
    # Text taken from the listing's description box.
    description: str = scrapy.Field()
    # User name read from the profile card.
    user: str = scrapy.Field()
    # Image URLs captured from the slide elements' style attributes.
    images: list = scrapy.Field()
class Example(scrapy.Spider):
    """Keyword-search spider for several coinafrique.com subdomains.

    A search request is issued for every (site, keyword) pair. ``parse``
    follows each listing on a result page and paginates until ``max_pages``
    is reached; ``followListing`` turns a listing page into a ``Data`` item.
    """

    name = 'example'
    search = '/search?category=&keyword='
    keywords = ['terrains', 'maison', 'land']
    max_pages = 2
    current_page = 1

    def gen_requests(self, url):
        """Yield one search request per configured keyword for the site ``url``.

        Args:
            url: Base URL of the site, without a trailing path.
        """
        for keyword in self.keywords:
            # The words of a multi-word keyword are joined with '+'.
            query = '+'.join(keyword.split(' '))
            yield scrapy.Request(url + self.search + query, callback=self.parse)

    def start_requests(self):
        """Yield the initial search requests for every site."""
        urls = [
            'https://ci.coinafrique.com',
            'https://sn.coinafrique.com',
            'https://bj.coinafrique.com',
        ]
        for url in urls:
            yield from self.gen_requests(url)

    def parse(self, response):
        """Follow every listing on a result page, then request the next page.

        The current page is read from the pagination element marked
        ``active`` and forwarded to ``followListing`` through ``cb_kwargs``.
        The next page is requested only while the current page number is
        below ``max_pages``.

        Args:
            response: Response for a search result page.

        Yields:
            One request per listing link, optionally followed by a request
            for the next result page.
        """
        current_page = response.xpath('//li/span[@class="active"]')
        current_text = current_page.xpath('.//text()').get()

        for listing in response.css('div.col.s6.m4'):
            href = listing.xpath('.//p[@class="ad__card-description"]/a/@href').get()
            if href is None:
                # Cards without a description link have nothing to follow.
                continue
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.followListing,
                cb_kwargs={"current_page": current_text},
            )

        try:
            page_number = int(current_text)
        except (TypeError, ValueError):
            # No active pagination element, or its text is not a number:
            # stop paginating instead of failing the whole callback.
            self.logger.warning(
                "Could not determine the current page on %s; not paginating",
                response.url,
            )
            return

        if page_number >= self.max_pages:
            return

        next_page = current_page.xpath('./following-sibling::span/a/@href').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def followListing(self, response, current_page):
        """Build a ``Data`` item from a single listing page.

        Args:
            response: Response for the listing page.
            current_page: Page value forwarded by ``parse`` through
                ``cb_kwargs``; stored on the item unchanged.

        Yields:
            One ``Data`` item holding the listing URL, page, description,
            user name and image URLs.
        """
        texts = response.xpath(
            '//div[@class="ad__info__box ad__info__box-descriptions"]//text()'
        ).getall()
        # The description is the second text node. A plain truthiness check
        # is not enough: a single matching node would still raise IndexError.
        description = texts[1] if len(texts) > 1 else ""
        profile = response.css('div.profile-card__content')
        user = profile.xpath('.//p[@class="username"]//text()').get()
        # .re() already returns the captured URLs as a list of strings.
        images = response.xpath(
            '//div[contains(@class,"slide-clickable")]/@style'
        ).re(r'url\((.*)\)')
        yield Data(
            url=response.url,
            current_page=current_page,
            description=description,
            user=user,
            images=images,
        )
Partial output from running the above with
scrapy crawl example -o results.json
{"url": "https://sn.coinafrique.com/annonce/terrains/vente-terrains-150-m2-mbao-4094405", "current_page": "4", "description": "Kalimo city situ\u00e9 \u00e0 30mm du centre-ville de dakar et \u00e0 10mn du lac rose plus pr\u00e9cis\u00e9ment \u00e0 ndiakhirate, proche de l'autoroute \u00e0 p\u00e9age a1 sortie 10 de diamniadio, aibd et du prolongement de la vdn. \ncette nouvelle cit\u00e9 disposant de toutes les commodit\u00e9s vous propose des parcelles de 150m\u00b2 en cours de viabilisation \u00e0 12 500 000 ht payables sur 2ans. \nmodalit\u00e9s de paiement : apport de r\u00e9servation 50% soit 6 250 000 + 200.000 pour les frais d\u2019ouverture de dossier et le reliquat \u00e9tal\u00e9 sur 2ans soit 260 416/ mois sans int\u00e9r\u00eat. \nnature juridique : titre foncier individuel", "user": "Fatou Thiam", "images": ["https://images.coinafrique.com/4094405_uploaded_image1_1676373698.jpg", "https://images.coinafrique.com/4094405_uploaded_image2_1676373698.jpeg", "https://images.coinafrique.com/4094405_uploaded_image1_1676373698.jpg", "https://images.coinafrique.com/4094405_uploaded_image2_1676373698.jpeg"]},
{"url": "https://sn.coinafrique.com/annonce/voitures/toyota-land-cruiser-2012-3898070", "current_page": "2", "description": "Toyota tr\u00e8s bien entretenu. moteur impeccable.", "user": "Mouhamed Seck", "images": ["https://images.coinafrique.com/3898070_uploaded_image1_1664451625.jpg", "https://images.coinafrique.com/3898070_uploaded_image1_1664451625.jpg"]},
{"url": "https://sn.coinafrique.com/annonce/voitures/toyota-land-cruiser-2016-3898332", "current_page": "2", "description": "Vente prado vxr 2016 full option 7 places en tres bon \u00e9tat", "user": "Arnaud Tavarez", "images": ["https://images.coinafrique.com/3898332_uploaded_image1_1664461271.jpg", "https://images.coinafrique.com/3898332_uploaded_image2_1664461271.jpeg", "https://images.coinafrique.com/3898332_uploaded_image3_1664461271.jpeg", "https://images.coinafrique.com/3898332_uploaded_image1_1664461271.jpg", "https://images.coinafrique.com/3898332_uploaded_image2_1664461271.jpeg", "https://images.coinafrique.com/3898332_uploaded_image3_1664461271.jpeg"]},
{"url": "https://sn.coinafrique.com/annonce/voitures/land-rover-range-rover-2014-3860928", "current_page": "2", "description": "Range rover sport hse\nPremi\u00e8re inscription09/2014\nPuissance215 kw (292 ch)\nType de carburant diesel\nTransmissionautomatique\nClasse d'\u00e9mission euro5\nClimatisation (climatisation\nAide au stationnement avant, arri\u00e8re\nVerrouillage centralis\u00e9 sans cl\u00e9\nDirection assist\u00e9e traction int\u00e9grale pneus tout temps pare-brise chauffant volant chauffant Bluetooth ordinateur de bord\nLecteur cd vitres \u00e9lectriques r\u00e9troviseur \u00e9lectrique r\u00e9glage de si\u00e8ge \u00e9lectrique antid\u00e9marrage Electrique\nVolant multifonctionnel\nSyst\u00e8me de navigation\nCommande vocale\nD\u00e9marrage/arr\u00eat automatique\nR\u00e9gulateur de vitesse\nEcran tactile\nPhares au x\u00e9non ", "user": "MANSA STORE", "images": ["https://images.coinafrique.com/3860928_uploaded_image1_1662209243.jpg", "https://images.coinafrique.com/3860928_uploaded_image2_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image3_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image4_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image5_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image6_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image1_1662209243.jpg", "https://images.coinafrique.com/3860928_uploaded_image2_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image3_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image4_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image5_1662209244.jpeg", "https://images.coinafrique.com/3860928_uploaded_image6_1662209244.jpeg"]},
{"url": "https://sn.coinafrique.com/annonce/voitures/toyota-land-cruiser-2018-3901898", "current_page": "2", "description": "Toyota prado land cruiser vx anne 2018 automatique diesel 5 palace full options grand \u00e9cran cam\u00e9ra de recul frigo bar \r\ndisponibles chez moi", "user": "Aly D\u00e9me", "images": ["https://images.coinafrique.com/3901898_uploaded_image1_1664670013.jpg", "https://images.coinafrique.com/3901898_uploaded_image2_1664669859.jpeg", "https://images.coinafrique.com/3901898_uploaded_image3_1664669859.jpeg", "https://images.coinafrique.com/3901898_uploaded_image1_1664670013.jpg", "https://images.coinafrique.com/3901898_uploaded_image2_1664669859.jpeg", "https://images.coinafrique.com/3901898_uploaded_image3_1664669859.jpeg"]},
{"url": "https://sn.coinafrique.com/annonce/terrains/terrain-700-m2-yoff-4055054", "current_page": "4", "description": "Terrain 700m2 pieds dans l\u2019eau virage - yoff\na vendre au virage yoff,\nune parcelle pieds l\u2019eau, \npour les amoureux \nde brise de mer ( 700 m2 )\nprix: 630.000.000 fcfa ", "user": "OVHA GROUP", "images": ["https://images.coinafrique.com/4055054_uploaded_image1_1674042554.jpg", "https://images.coinafrique.com/4055054_uploaded_image1_1674042554.jpg"]},
{"url": "https://sn.coinafrique.com/annonce/voitures/land-rover-range-rover-vogue-2020-3889515", "current_page": "2", "description": "Prix d\u00e9douan\u00e9 \n\na vendre magnifique range rover vogue v6 diesel \n\nfull option \n\nv\u00e9hicule diplomatique, entretenu exclusivement chez range rover casablanca. \n\n2 cl\u00e9s / parfait \u00e9tat ", "user": "Auto Elegance", "images": ["https://images.coinafrique.com/3889515_uploaded_image1_1663920624.jpg", "https://images.coinafrique.com/3889515_uploaded_image2_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image3_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image4_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image5_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image6_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image7_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image8_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image9_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image1_1663920624.jpg", "https://images.coinafrique.com/3889515_uploaded_image2_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image3_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image4_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image5_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image6_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image7_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image8_1663920624.jpeg", "https://images.coinafrique.com/3889515_uploaded_image9_1663920624.jpeg"]}