I am trying to make a web crawler which crawls IMDB and lists the movie name and rating. This is my index.js file. Suppose i am crawling for 10 movies. I am then saving the crawled results in a different file say 'message.txt'. Now i want to send this message.txt file as a response to any request. But whenever I make a request it always send me an empty file to my browser initially. Then i notice that it takes some time before the crawled results are saved in the message.txt file. I think this is because all actions are asynchronous in nodejs. So is there a way to send the message.txt file only after crawling is complete?
var express = require('express');
var app = express();
var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');
app.listen(8080);
console.log('Running');
app.get('/', function(req, res) {
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
request(url, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html);
var title, ratings, released;
var json = {
title: '',
ratings: '',
released: ''
};
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
console.log(json);
fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {});
};
});
};
res.sendFile(__dirname + '/index.js');
});
You can use the async package which is great for controlling flow, something like:
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
async.waterfall([
function sendRequest (callback) {
if (!error) {
var $ = cheero.load(html);
var json = {
title: '',
ratings: '',
released: ''
}
}
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
callback(null, JSON.stringify(json, null, 4) + '\n');
},
function appendFile (json, callback) {
fs.appendFile('message.txt', json, function(err) {
if (err) { callback(err); }
callback();
});
}
], function(err) {
res.sendFile(__dirname + '/index.js');
});