Search code examples
javascript, node.js, callback, promise

How to use request synchronously in Node.js?


Here is my very basic script to scrape a given URL and get all the links in it. I want to print links_arr after the request function is complete. But the request callback is executed asynchronously, after the console.log has already run, so I get an empty array printed. How do I do this synchronously, i.e. in the following sequence?

  1. URL is requested.
  2. Cheerio gets all the links.
  3. We loop through all the items, and insert them into the links_arr array.
  4. Print the array.

P.S. I know that Node.js is meant for asynchronous tasks, but this is a requirement that I need to fulfill. I read that there are things like Promises which can help me with this, but since I don't have advanced knowledge of Node, I don't know how to implement them. Some help would be much appreciated. I am using the request library for HTTP requests, url for URL parsing, and cheerio for HTML parsing.

var request = require('request');
var cheerio = require('cheerio');
var url = require('url');
// Crawl bookkeeping arrays. None of the three is read or written by the code
// shown below; presumably reserved for the stubbed *_processing / CSV helpers.
var all_links = [];
var processing = [];
var processed = [];
// Starting point of the crawl; also the base that relative hrefs are resolved against.
var base_url = "https://www.npmjs.com";
// Absolute links are only accepted when their host equals this value.
var base_host = "www.npmjs.com";


/**
 * Decide whether a link should be crawled and return it in normalised form.
 *
 * @param {string} u - Absolute or relative URL (typically an href value).
 * @returns {string|false} Relative links are resolved against base_url;
 *   absolute http(s) links whose host equals base_host are returned in
 *   url.format() form. Everything else yields false.
 */
var analyze_href_sync = function(u){
    console.log("Analysing URL "+u);
    // Anchors without an href yield undefined, and url.parse() throws on
    // non-string input, so reject those up front.
    if (typeof u !== "string") {
        return false;
    }
    // Locals are declared explicitly: the bare assignments used previously
    // leaked globals and throw a ReferenceError in strict mode.
    const url_obj = url.parse(u);
    const url_formatted = url.format(url_obj);
    const protocol = url_obj.protocol || "";
    if (!url_obj.host) {
        // A scheme without a host (e.g. "javascript:") is not a relative
        // path on this site, so it must not be resolved against base_url.
        if (protocol && !protocol.startsWith("http")) {
            return false;
        }
        //Relative URL
        return url.resolve(base_url, url_formatted);
    }
    if (protocol.startsWith("http") && url_obj.host === base_host) {
        return url_formatted;
    }
    return false;
}

// Requests the page at `u` and pushes the href of every <a> element into links_arr.
// NOTE: this is the version the question is about. The callback passed to
// request() runs later than the statements that follow the request() call, so
// the console.log at the bottom prints links_arr while it is still empty.
var scrape_all_links_sync = function(u){
    console.log("Scraping all links from URL "+u);
    var links_arr = [];
    request(u, function(err, res, body){
        // `err` is not checked before `body` is parsed.
        // `$`, `links` and `href` are assigned without a declaration (implicit globals).
        $ = cheerio.load(body);
        links = $('a');
        $(links).each(function(i, link){
            href = $(link).attr('href');
            console.log(href);
            links_arr.push(href);
        });
    });

    console.log(links_arr); //Need to print this, after the above request loop is complete. i.e. After the array is filled.
}

// Empty placeholder: accepts `u`, does nothing and returns undefined.
// Presumably meant to write the URL to a CSV file (going by the name) — TODO implement.
var store_into_csv_sync = function(u){

}

// Empty placeholder: accepts `u`, does nothing and returns undefined.
// Presumably meant to add the URL to the `processing` list — TODO implement.
var insert_into_processing_sync = function(u){

}

// Empty placeholder: accepts `u`, does nothing and returns undefined.
// Presumably meant to remove the URL from the `processing` list — TODO implement.
var remove_from_processing_sync = function(u){

}

/**
 * Entry point for a single URL: validate/normalise it, then scrape its links.
 *
 * @param {string} u - URL to start from.
 * @returns {undefined}
 */
var main = function(u){
    const target_url = analyze_href_sync(u);
    // analyze_href_sync signals "do not crawl" with the boolean false only, so
    // compare strictly; a loose comparison would also reject strings like "0".
    if (target_url === false) {
        return;
    }
    scrape_all_links_sync(target_url);
}

// Start the script from the site's base URL.
main(base_url);

The output of the above script is

Analysing URL https://www.npmjs.com
Scraping all links from URL https://www.npmjs.com/
[]
...
*All the other links found*

Solution

  • You need to put console.log(links_arr); inside the callback function:

    /**
     * Request the page at `u`, collect the href of every <a> element and
     * print the collected array once the response has been processed.
     *
     * @param {string} u - URL of the page to scrape.
     * @returns {undefined} The result is only available inside the callback.
     */
    var scrape_all_links_sync = function(u){
        console.log("Scraping all links from URL "+u);
        var links_arr = [];
        request(u, function(err, res, body){
            // On a failed request there is no usable body to parse, so stop here.
            if (err) {
                console.error("Request for "+u+" failed: "+err.message);
                return;
            }
            // Declared locally so nothing leaks into the global scope.
            const $ = cheerio.load(body);
            $('a').each(function(i, link){
                const href = $(link).attr('href');
                console.log(href);
                links_arr.push(href);
            });
            // Printed inside the callback: this is the first point at which
            // the array has been filled.
            console.log(links_arr);
        });
    }
    

    So just move that one statement up, into the callback, and it will be fine.

    In JavaScript we have 3 ways of handling asynchronous code:

    1. callbacks
    2. promises
    3. generator based

    So you can choose which one to use, and you can also mix them (e.g. depending on the library you use). You may read more information here.