I am using Scrapy plus Selenium to scrape data from dynamic pages. Here is my spider code:
```python
# -*- coding: utf-8 -*-
import os
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector
from selenium import webdriver


class asbaiduSpider(CrawlSpider):
    name = 'apps_v3'
    start_urls = ["http://as.baidu.com/a/software?f=software_1012_1"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cid=(50[0-9]|510)&s=1&f=software_1012_1",)),
             callback='parse_item', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        chromedriver = "/usr/bin/chromedriver"
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = webdriver.Chrome(chromedriver)

    def __del__(self):
        # WebDriver has quit(), not stop()
        self.driver.quit()

    def parse_item(self, response):
        hxs = Selector(response)
        links = hxs.xpath('//a[@class="hover-link"]/@href').extract()
        for link in links:
            time.sleep(2)
            # yield (not return) so every extracted link gets requested
            yield Request(link, callback=self.parse_page)

    def parse_page(self, response):
        self.driver.get(response.url)
        time.sleep(2.5)
        app_comments = ''
        num = len(self.driver.find_elements_by_xpath(
            "//section[@class='s-index-page devidepage']/a"))
        print 'num:\t%s' % num
        if num == 8:
            print 'num====8 ohohoh'
            while True:
                link = self.driver.find_element_by_link_text(u'下一页')  # "next page"
                try:
                    link.click()
                except:
                    break
```
The problem is, every time after clicking through to page 2, it just quits the current page, but I need to crawl page 3, page 4, and so on. The pages I need to parse look like: http://as.baidu.com/a/item?docid=5302381&pre=web_am_software&pos=software_1012_0&f=software_1012_0 (the site is in Chinese, sorry for the inconvenience). I need to page through the pager at the bottom and scrape the comment data. I have been stuck on this problem for two days. I really appreciate any help. Thank you.
If I have understood your case correctly, I think you can proceed with the logic below.
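A minimal sketch of that logic (my own reconstruction, untested against the live site): scrape the comments that are already rendered *before* clicking, then re-locate the 下一页 ("next page") link on every pass, because the previous element reference goes stale once the comment list reloads. The XPath `//div[@class="comment-item"]` is a placeholder; substitute the real selector from the page's markup.

```python
# -*- coding: utf-8 -*-
import time

from selenium.common.exceptions import NoSuchElementException


def scrape_all_comment_pages(driver, url):
    """Collect comment text from every page of the bottom pager."""
    driver.get(url)
    time.sleep(2.5)  # crude wait for the dynamic content to render
    comments = []
    while True:
        # Scrape the current page BEFORE clicking away from it.
        # NOTE: this XPath is a placeholder, not the real Baidu markup.
        for node in driver.find_elements_by_xpath('//div[@class="comment-item"]'):
            comments.append(node.text)
        # Re-find the link on every iteration: the reference from the
        # previous page goes stale after the content reloads.
        try:
            next_link = driver.find_element_by_link_text(u'下一页')  # "next page"
        except NoSuchElementException:
            break  # no more pages
        next_link.click()
        time.sleep(2)  # crude wait for the next batch of comments
    return comments
```

You could call this from `parse_page` with `self.driver` so the whole crawl reuses one browser instance. If clicking still dumps you off the page, the 下一页 link may navigate to a new URL rather than paginate in place; in that case, read `next_link.get_attribute('href')` and follow it with `driver.get()` instead of clicking.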