Search code examples
node.js zombie.js node-async

Zombiejs - fetching contents of links synchronously


I have been playing with nodejs and zombiejs to fetch some personal data from a site. Unfortunately I am stuck at a point where zombiejs only gets me the data from the first link and then hangs.

The steps I follow are-

  1. Go to the base URL
  2. Get the number of pages
  3. Use the async library to fetch them in series, opening a new browser window every time. Note that I only open a new browser window instead of creating a totally new browser instance, as it is expensive to create one.

This is my code

// Visits the Stack Overflow base URL, then tries to visit listing pages 1..10
// of the newest "java" questions one after another, printing each page's URL
// and HTML.
var Browser = require("zombie");
var async = require('async');

var so_base="http://stackoverflow.com";
// Listing URL prefix; the page number is appended to it.
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";

var browser = new Browser();
browser.visit(so_base, function () {
    // Page numbers 1 through 10.
    var arr=[];
    for(var i=1;i<=10;i++) {
      arr.push(i);
    }
    async.eachSeries(
                arr,
                // Iterator: called with a page number `k` and a completion `callback`.
                function(k, callback) {
                        // Open a new window on the shared Browser instance.
                        browser.open();
                        browser.visit(so_url+k,function() { 
                                console.log(browser.location.href);
                                console.log(browser.html());           
                                // NOTE: `callback` is never invoked in this iterator.
                                // eachSeries waits for it before starting the next
                                // item, so only the first page is ever processed.
                        });
                },
                // Final callback: logs whatever eachSeries passes as `e`.
                function(e) {
                        console.log(e);
                });
});

Results

>node main_zombie.js 
..... HTML DUMP
http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=1
>

Any suggestions would be appreciated


Solution

  • Found the mistake

    As per https://github.com/caolan/async#each

    One needs to call the callback function with no arguments (or with null) when there is no error; otherwise eachSeries never moves on to the next item. So the correct code would be

    // Fetches the newest "java" question listing pages from Stack Overflow one
    // at a time, printing each page's URL and HTML.
    var Browser = require("zombie");
    var async = require('async');

    var so_base = "http://stackoverflow.com";
    // Listing URL prefix; the page number is appended to it.
    var so_url = "http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
    // Number of listing pages to fetch (pages 1..pageCount).
    var pageCount = 10;

    var browser = new Browser();
    browser.visit(so_base, function () {
        var pages = [];
        for (var i = 1; i <= pageCount; i++) {
            pages.push(i);
        }
        async.eachSeries(
            pages,
            function (page, callback) {
                // Reuse the single Browser instance; only open a fresh window per page.
                browser.open();
                browser.visit(so_url + page, function (err) {
                    if (err) {
                        // Report the failure instead of silently dropping it, but
                        // keep going so the remaining pages are still fetched.
                        console.error("Failed to load page " + page + ":", err);
                    }
                    console.log(browser.location.href);
                    console.log(browser.html());
                    // eachSeries only moves on to the next page once this is called.
                    callback();
                });
            },
            function (e) {
                if (e) {
                    console.error(e);
                }
                // Runs once after the series has ended, so the browser is closed
                // without hard-coding the number of the last page in the iterator.
                browser.close();
            });
    });