I am new to PhantomJS and trying to capture the homepage of Trade Me. Here's my code so far:
var page = require('webpage').create();

page.open('http://trademe.co.nz', function () {
  var POLL_INTERVAL_MS = 1000;   // pause between footer checks
  var CAPTURE_DELAY_MS = 10000;  // wait between seeing the footer and rendering

  // Evaluated in the page context: asks the page's `$` whether the
  // `.site-footer` element matches `:visible`.
  function isFooterVisible() {
    return page.evaluate(function () {
      return $('.site-footer').is(':visible');
    });
  }

  // Evaluated in the page context: sets the body's scroll offset to the
  // body's full scroll height.
  function scrollToBottom() {
    page.evaluate(function () {
      window.document.body.scrollTop = document.body.scrollHeight;
    });
  }

  // Writes the PNG screenshot and terminates PhantomJS.
  function captureAndExit() {
    console.log('Capturing');
    page.render('phantom-capture.png', {format: 'png'});
    phantom.exit();
  }

  // Poll on a fixed interval. The interval is never cleared, so every tick
  // that sees the footer logs 'Found' and queues its own delayed capture;
  // the first capture to fire ends the process.
  window.setInterval(function () {
    if (isFooterVisible()) {
      console.log('Found');
      window.setTimeout(captureAndExit, CAPTURE_DELAY_MS);
      return;
    }

    console.log('Scrolling');
    scrollToBottom();
  }, POLL_INTERVAL_MS);
});
There are several things that baffle me:
1. `Scrolling` never gets printed.
2. Execution eventually reaches `Found`, and the word is printed 10 times. I assume that's because it is contained within the `setInterval` block with a 1 second interval, and there's a 10 second wait caused by the `setTimeout`?
3. The captured image still shows the `Loading...` message.

I'm new to all this and my knowledge of JavaScript is very rusty.
Ryan Doherty has provided a great explanation as to why `console.log('Scrolling');` never gets called, and you figured out yourself why `Found` is printed 10 times!
And I'd like to talk about how to deal with those ajaxified pages. Generally when you work with such sites you can figure out a criterion by which to judge if the page has loaded, or at least parts of it that you need (though sometimes, as Ryan rightfully notes, it can be very hard, especially if there are a lot of external resources and/or iframes on a page).
In this very case I suppose we can decide that the page has loaded when there are no "Loading" labels left. So we turn off JavaScript and inspect those labels. It turns out they are `<div class="carousel-loading-card">` elements. That means we only have to wait until they are gone. But to trigger their loading we must simulate page scrolling. In PhantomJS you can "natively" do that by changing the `page.scrollPosition` setting.
var page = require('webpage').create();

// Let's not confuse the target site with PhantomJS's default user agent
// and native viewport dimensions of 400x300.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0';
page.viewportSize = { width: 1280, height: 1024 };

// State shared with wait(): the document height measured right after load,
// and the current vertical scroll offset in pixels.
var totalHeight;
var scroll = 0;

page.open('http://trademe.co.nz', function (status) {
  // PhantomJS reports 'success' or 'fail' here. Without this check a failed
  // load falls through to wait(), which finds no loading cards on the empty
  // page and saves a blank screenshot as if everything had worked.
  if (status !== 'success') {
    console.log('Unable to load http://trademe.co.nz (status: ' + status + ')');
    phantom.exit(1);
    return;
  }

  // Measured inside the page via the site's own `$`.
  totalHeight = page.evaluate(function () {
    return $(document).height();
  });

  wait();
});
// Tuning knobs and state for wait().
var WAIT_SCROLL_STEP_PX = 200;     // how far each poll scrolls the page
var WAIT_POLL_INTERVAL_MS = 3000;  // delay between polls
var WAIT_MAX_POLLS = 60;           // give up after this many polls
var waitPollCount = 0;             // polls performed so far

/**
 * Polls the page until no `.carousel-loading-card` elements remain, scrolling
 * down one step per poll to trigger loading, then renders the final
 * screenshot and exits PhantomJS.
 *
 * Reads and updates the outer `scroll` offset and reads `totalHeight`; both
 * must be set before the first call. While cards remain and the scroll offset
 * has not passed `totalHeight`, each poll also saves a timestamped JPEG of
 * the current viewport.
 *
 * Polling is capped at WAIT_MAX_POLLS so that a card which never finishes
 * loading cannot keep the process alive forever. On giving up, the final
 * screenshot is still rendered (best effort) and the process exits with
 * status 1 instead of 0.
 */
function wait() {
  var loading = page.evaluate(function () {
    return $(".carousel-loading-card").length;
  });

  if (loading > 0 && waitPollCount < WAIT_MAX_POLLS) {
    waitPollCount += 1;

    // Only keep scrolling while there is page left below the viewport.
    if (scroll <= totalHeight) {
      scroll += WAIT_SCROLL_STEP_PX;
      page.scrollPosition = {
        top: scroll,
        left: 0
      };
      page.render('trademe-' + (new Date()).getTime() + '.jpg');
    }

    console.log(loading + " panels left. Scroll: " + scroll + "px");
    setTimeout(wait, WAIT_POLL_INTERVAL_MS);
    return;
  }

  var gaveUp = loading > 0;
  if (gaveUp) {
    console.log("Giving up after " + waitPollCount + " polls; " +
      loading + " panels still loading.");
  }

  // Restore defaults to make a full page screenshot at the end
  page.scrollPosition = { top: 0, left: 0 };
  page.render('trademe-ready.png');
  phantom.exit(gaveUp ? 1 : 0);
}