Search code examples
node.js · asynchronous · web-crawler · imdb

Need to send response after an action has completed


I am trying to make a web crawler that crawls IMDB and lists each movie's name and rating. This is my index.js file. Suppose I am crawling 10 movies. I then save the crawled results in a separate file, say 'message.txt'. Now I want to send this message.txt file as the response to any request. But whenever I make a request, my browser initially receives an empty file. I then noticed that it takes some time before the crawled results are saved to message.txt. I think this is because these operations are asynchronous in Node.js. So is there a way to send the message.txt file only after crawling is complete?

var express = require('express');
var app = express();

var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');

// Start the HTTP server and log a startup message.
var port = 8080;
app.listen(port);
console.log('Running');


// GET / handler: starts 10 page fetches (URLs ending tt1 .. tt10), scrapes a
// title, release text and rating out of each response with cheerio, appends
// each result as pretty-printed JSON to message.txt, and responds with a file.
//
// NOTE(review): the while loop only *starts* the fetches; the parsing and the
// file appends happen later inside the `request` callbacks. The res.sendFile
// call at the bottom is reached as soon as the loop finishes, so the response
// is presumably sent before any result has been written.
app.get('/', function(req, res) {
  console.log('Recieved the get Request');
  var i = 1;      // numeric suffix appended to the title URL
  var count = 0;  // number of requests started so far
  while (count < 10) {
    var url = 'http://www.imdb.com/title/tt' + i + '/'; 
    console.log(url);
    count = count + 1;
    i = i + 1;
    request(url, function(error, response, html) {
      // A failed request is skipped silently: nothing is logged or written.
      if (!error) {
        var $ = cheerio.load(html);
        // NOTE(review): these three locals are declared but never used;
        // the values are stored on `json` instead.
        var title, ratings, released;
        var json = {
          title: '',
          ratings: '',
          released: ''
        };
        // .filter() is used here only to run the callback on each match;
        // its return value is discarded.
        $('.title_wrapper').filter(function() {
          var data = $(this);
          // Title: text of the wrapper's first child.
          json.title = data.children().first().text().trim();
          // Released: text of the last child of the wrapper's last child.
          json.released = data.children().last().children().last().text().trim();
        });
        $('.ratingValue').filter(function() {
          var data = $(this);
          json.ratings = parseFloat(data.text().trim());
        });
        console.log(json);
        // The append callback is empty, so a write error is ignored.
        fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {});   
      };
    });
  };
  // NOTE(review): this sends index.js, while the results are written to
  // message.txt -- confirm which file is meant to be returned.
  res.sendFile(__dirname + '/index.js');
});


Solution

  • You can use the async package (install it and add `var async = require('async');` at the top of the file), which is great for controlling flow. Something like:

      console.log('Recieved the get Request');
      var i = 1;
      var count = 0;
      while (count < 10) {
        var url = 'http://www.imdb.com/title/tt' + i + '/';
        console.log(url);
        count = count + 1;
        i = i + 1;
        async.waterfall([
            function sendRequest (callback) {
                if (!error) {
                    var $ = cheero.load(html);
                    var json = {
                        title: '',
                        ratings: '',
                        released: ''
                    }
                }
                $('.title_wrapper').filter(function() {
                    var data = $(this);
                    json.title = data.children().first().text().trim();
                    json.released = data.children().last().children().last().text().trim();
                });
                $('.ratingValue').filter(function() {
                    var data = $(this);
                    json.ratings = parseFloat(data.text().trim());
                });
                callback(null, JSON.stringify(json, null, 4) + '\n');
            },
            // Waterfall step 2: append the serialized result to message.txt.
            //
            // json     -- text to append (already stringified by the previous step)
            // callback -- called exactly once: with the write error, or with no
            //             arguments on success
            function appendFile (json, callback) {
                fs.appendFile('message.txt', json, function(err) {
                    // Return after reporting the error; falling through would
                    // invoke the waterfall callback a second time.
                    if (err) { return callback(err); }
                    callback();
                });
            }
        ], function(err) {
            // Final waterfall callback: runs after the last step finishes, or as
            // soon as any step reports an error. `err` is not inspected here.
            // NOTE(review): this waterfall is started inside the while loop, so
            // this callback runs once per iteration and res.sendFile is called
            // repeatedly on the same response; presumably only the first call
            // can succeed. Consider responding once, after all iterations have
            // completed.
            res.sendFile(__dirname + '/index.js');
        });