I am building a web scraper using CasperJS. I am trying to scrape 3 pages. Each page has 15 links I want to scrape.
In the following code snippet, all the links I want to scrape are collected in the allLinks
array. Its length is 45. When I log the array, I see all the links I want to scrape correctly. I then iterate over each link and visit the page. When I log link
(contains each link), I can still see all the 45 links as expected. However, when I log link
inside the thenOpen
function, I only see links from the first page. So, it only logs 15 links. It doesn't log any links from page2 and page3.
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Getting all the links that need to be visited");
console.log(allLinks);
console.log("Getting each link");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
});
});
});
Following is the complete code.
var casper = require('casper').create();
var url = casper.cli.get(0);
console.log(url);
var page2 = casper.cli.get(1);
console.log(page2);
jsonObj = { data : [] };
//var url = 'http://www.houzz.com/professionals/c/Nashville--TN/p/15';
var webPage = require('webpage');
zapTitle = [];
zapContact = [];
zapServices = [];
var page = webPage.create();
var nextBtn = "a.navigation-button.next";
var allLinks = [];
casper.start(url);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
for (var i = 1; i <= page2; i = i + 1) {
this.then(function(){
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
console.log(allLinks);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
});
});
};
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Inside the each function");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
var description = this.fetchText('div.profile-about div:nth-child(1)');
description = description.replace(/[\t\n]/g,"");
var name = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(2) div.info-list-text');
name = name.replace(/[<b>Contact</b>: ]/g,"");
jsonObj.data.push({
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.fetchText('div.profile-about div:nth-child(1)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
});
casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.getHTML('div.pro-contact-methods span.pro-contact-text:nth-child(2)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
}
});
}).then(function() {
console.log(jsonObj.data.length);
//console.log(jsonObj);
if (jsonObj.data.length == 13) {
console.log(jsonObj.data[13].title);
}
/*for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
console.log(i);
console.log("zaptitle");
//zapTitle.push(jsonObj.data[i]);
console.log(jsonObj.data[i].title);
//}
}*/
//require('utils').dump(jsonObj.data[2].title);
//require('utils').dump(jsonObj);
//require('utils').dump(jsonObj.data[8]);
//require('utils').dump(zapTitle);
for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
zapServices.push(jsonObj.data[i].services);
}
/*casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {"title" : zapTitle,
//"contact" : zapContact,
"services" : zapServices
}*/
});
});
});
casper.getHTML(selector)
fails and exits the script if it doesn't find the element you are looking for (code reference). Of course, this error is hidden if you're using PhantomJS 2.0 or 2.1.
The culprit is "location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
on the 12th page, because it doesn't exist. You need to check whether the selector exists (e.g. with casper.exists(selector)
) before trying to access it.
This is what a little console.log
debugging can accomplish.