I am building a simple web scraper. I am trying to scrape every link that has the class pro-title
on this page: http://www.home.com/professionals/c/oho,-TN. I don't understand why the thenOpen()
callback is executing twice. Here is my script:
// CasperJS script from the question: collect the href of every element with
// class "pro-title" on the listing page, open each collected href in turn and
// print the text of its "div.info-list-text" elements.
//
// NOTE(review): CasperJS documents a boolean `verbose` option and a `logLevel`
// of "debug" | "info" | "warning" | "error"; confirm that `logLevel:"verbose"`
// and `debug:true` actually enable the logging that was intended.
var casper = require('casper').create({
logLevel:"verbose",
debug:true
});
// Filled in by the getLinks step below and read by the step after it.
var links;
// `name` and `paragraph` are declared but never referenced in this script.
var name;
var paragraph;
// Shadowed by the `var firstName` declared inside the thenOpen callback below.
var firstName;
// Load the listing page; the steps queued with then() run after it.
casper.start('http://www.home.com/professionals/c/oho,-TN');
casper.then(function getLinks(){
// The function handed to evaluate() runs inside the loaded page (hence the use
// of `document`); its return value is handed back to this script.
links = this.evaluate(function(){
var links = document.getElementsByClassName('pro-title');
// getAttribute('href') yields the attribute exactly as written in the markup,
// so the collected values are not guaranteed to be absolute, openable URLs.
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
//this.echo(links);
});
return links;
});
});
casper.then(function(){
// Queue one "open this href, then run the callback" step per collected value.
// Every value is passed to thenOpen() without any validation.
this.each(links,function(self,link){
self.thenOpen(link,function(a){
//this.echo(this.getCurrentUrl());
// this.echo(this.getCurrentUrl());
//this.echo("first");
// Print the text of the elements matching 'div.info-list-text' on the opened page.
var firstName = this.fetchText('div.info-list-text');
this.echo(firstName);
});
});
});
// Run the queued steps; the callback exits once they have all completed.
casper.run(function(){
this.exit();
});
Artjom B was correct: you were gathering href attributes that weren't valid URLs.
You can filter them out with a regular expression before calling thenOpen():
/*
 * CasperJS script: collect the href of every element with class "pro-title"
 * on the listing page, open each one that looks like a URL, and print the
 * text of the "div.info-list-text" elements found on the opened page.
 *
 * CasperJS scripts run inside PhantomJS/SlimerJS rather than Node, so this
 * deliberately sticks to ES5 syntax (var, function expressions).
 */
var casper = require('casper').create({
    // `verbose` turns console output of log messages on; `logLevel` must be
    // one of "debug", "info", "warning" or "error".
    verbose: true,
    logLevel: 'debug'
});

// hrefs collected by the getLinks step and consumed by the openLinks step.
var links = [];

// Loose "looks like a URL" pattern: a host-like token, a dot, a 2-4 letter
// TLD and an optional path. It has no `g` flag, so test() keeps no state
// between calls.
var urlPattern = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/i;

/**
 * Tell whether a collected href is worth handing to thenOpen().
 *
 * @param {*} href - Raw value returned by getAttribute('href'); null when the
 *     element has no href attribute.
 * @returns {boolean} true when href is a string matching urlPattern.
 */
function isOpenableUrl(href) {
    return typeof href === 'string' && urlPattern.test(href);
}

casper.start('http://www.houzz.com/professionals/c/Nashville,-TN');

casper.then(function getLinks() {
    // The function handed to evaluate() runs inside the loaded page; only its
    // (serialisable) return value comes back to this script.
    var hrefs = this.evaluate(function () {
        var anchors = document.getElementsByClassName('pro-title');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });
    // evaluate() can hand back null (e.g. when the page-side function fails);
    // fall back to an empty list because each() only accepts arrays.
    links = hrefs || [];
});

casper.then(function openLinks() {
    this.each(links, function (self, link) {
        // Skip hrefs that are missing or are not URLs so that thenOpen() is
        // never asked to navigate to them.
        if (!isOpenableUrl(link)) {
            return;
        }
        self.thenOpen(link, function () {
            this.echo(this.fetchText('div.info-list-text'));
        });
    });
});

// Run the queued steps; exit once they have all completed.
casper.run(function () {
    this.exit();
});