Tags: node.js, web-crawler, out-of-memory, html-to-text

node.js: why do I get "RangeError: Maximum call stack size exceeded"?


The purpose of the program below is to crawl CNN and write all of its text to a single file (using a couple of third-party packages).

I get

RangeError: Maximum call stack size exceeded

How can I troubleshoot this, and how can I work around it? Is there a way I can "free" memory, and if so, how?

//----------Configuration--------------

var startingUrl = "http://cnn.com"; // keep the http/https or www prefix
var crawlingDepth = "50";
var outputFileName = "cnn.txt";

//-------------------------------------

var Crawler = require("js-crawler");
var sanitizeHtml = require('sanitize-html');
var htmlToText = require('html-to-text');
var fs = require('fs');

var index = 0;

new Crawler().configure({depth: crawlingDepth})
  .crawl(startingUrl, function onSuccess(page) {

    // Convert the crawled page's HTML to plain text
    var text = htmlToText.fromString(page.body, {
        wordwrap: false,
        hideLinkHrefIfSameAsText: true,
        ignoreHref: true,
        ignoreImage: true
    });

    index++;
    console.log(index + " pages were crawled");

    // Append the extracted text to the output file
    fs.appendFile(outputFileName, text, function (err) {
        if (err) {
            console.log(err);
        }
        console.log('It\'s saved! in same location.');
    });
  });

Solution

  • 1) This is a problem with recursion depth: the crawler follows every link recursively, so a large depth overflows the call stack.

    2) To avoid it, traverse the site level by level instead of recursively:

    • At each depth level, loop over the links collected for that level (the first level contains only the starting URL);

    • Collect the links of the current page with 'Crawler.prototype._getAllUrls' and, if a link has not been processed yet, queue it for the next level;

    3) A concept sketch:

    var Urls = [ ["http://cnn.com/"] ]; // URLs to crawl, grouped by depth level
    var crawledUrls = {};               // Lookup of URLs already crawled
    var crawlingDepth = 3;
    var depth = 0;                      // Current depth
    var index = 0;                      // Current index within the level
    var Crawler = require("js-crawler");

    function crawling() {
      console.log(depth, index, Urls[depth][index]);

      // Prepare the next level
      if (typeof Urls[depth+1] === "undefined") Urls.push([]);

      // Mark the current URL as crawled
      crawledUrls[ Urls[depth][index] ] = true;

      new Crawler().configure({depth: 1}).crawl({
        url: Urls[depth][index],
        success: function(page) {
          // Do something with the crawled page

          // Collect the URLs found on the crawled page
          var urls = Crawler.prototype._getAllUrls( page.url, page.body );
          for (var j = 0; j < urls.length; j++) {
            // Keep only same-domain links that have not been crawled yet
            if ( typeof crawledUrls[urls[j]] === "undefined"
                 && urls[j].indexOf(Urls[0][0]) === 0 ) {
              Urls[depth+1].push(urls[j]);
            }
          }
        },
        failure: function(page) {
        },
        finished: function(crawled) {
          index++;
          if (index < Urls[depth].length) {
            // More URLs left at this level: schedule the next one on a fresh stack
            setTimeout(crawling, 0);
          } else {
            // Level exhausted: move one level deeper
            depth++;
            index = 0;
            if (depth < crawlingDepth) {
              setTimeout(crawling, 0);
            } else {
              // Finished
            }
          }
        }
      });
    }

    crawling();
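
    For the "Do something with the crawled page" step, the text extraction from the question can be reused. A minimal sketch, assuming the same html-to-text options and output file name as the original program; savePageText is a hypothetical helper name, not part of any library:

    var htmlToText = require('html-to-text');
    var fs = require('fs');
    var outputFileName = "cnn.txt"; // assumption: same output file as in the question

    // Convert a crawled page's HTML to plain text and append it to the output file
    function savePageText(page) {
      var text = htmlToText.fromString(page.body, {
        wordwrap: false,
        hideLinkHrefIfSameAsText: true,
        ignoreHref: true,
        ignoreImage: true
      });
      fs.appendFile(outputFileName, text, function (err) {
        if (err) console.log(err);
      });
    }

    Calling savePageText(page) inside the success callback keeps the file writes asynchronous, so they add no frames to the crawling call stack.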