I'm trying to scrape a website but need to use splash in all pages because their content created dynamically. right now it renders only the first page, but not the content page or the pagination pages.
here's the code:
import scrapy
from scrapy_splash import SplashRequest
import scrapy_splash
class ShutSpider(scrapy.Spider):
name = 'Shut'
def start_requests(self):
yield SplashRequest(url='ROOTURL',callback=self.parse)
def parse(self, response):
# follow links to author pages
content=response.xpath('//*[@id="iconQuesBar"]/a[4]/@href').extract()
for href in content:
yield response.follow(href.replace('?id=', ''), self.parse_QNA)
if content == []:
return
# follow pagination links
for href in response.xpath('//*[@id="body-div"]/table/tbody/tr[2]/td[3]/center/form/span/a/@href').extract():
yield response.follow(href, self.parse)
def parse_QNA(self, response):
yield {
'url': response.url,
'title': response.xpath('//h1[@class = "head"]/text()').extract()
I've played with it in every way I could think off but it didn't work. the only solution I can think of right now is to send the links to the content pages and pagination using the render API but I think it's really bad coding and there has to be another way.
Thanks for you help.
Instead of response.follow()
, explicitely yield new SplashRequest
for subsequent pages. Also, you'll have to use response.urljoin()
in this case. Here's modified code:
import scrapy
from scrapy_splash import SplashRequest
import scrapy_splash
class ShutSpider(scrapy.Spider):
name = 'Shut'
def start_requests(self):
yield SplashRequest(url='ROOTURL',callback=self.parse)
def parse(self, response):
# follow links to author pages
content = response.xpath('//*[@id="iconQuesBar"]/a[4]/@href').extract()
for href in content:
yield SplashRequest(response.urljoin(href.replace('?id=', '')), self.parse_QNA)
if content == []:
return
# follow pagination links
for href in response.xpath('//*[@id="body-div"]/table/tbody/tr[2]/td[3]/center/form/span/a/@href').extract():
yield SplashRequest(response.urljoin(href), self.parse)
def parse_QNA(self, response):
yield {
'url': response.url,
'title': response.xpath('//h1[@class = "head"]/text()').extract()