Search code examples
javascriptnode.jswebweb-scrapingcheerio

how to handle string comparison and file writing using xlsx with nodejs


This script is to get the title of the webpage where the URL of the website will be passed from an excel file, check to see if the title contains the keyword, and then store that domain in the new excel file.

There is no issue with the partial code, but the title comparison does not work as expected. Does anyone have an idea how to fix it?

here is my code

var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");

jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];

function fetchTitle(url, onComplete = null) {
  request(url, function (error, response, body) {
    var output = url; // default to URL

    if (!error && (response && response.statusCode) === 200) {
      var $ = cheerio.load(body);
      console.log(`URL = ${url}`);

      var title = $("head > title").text().trim();
      console.log(`Title = ${title}`);
      output = `[${title}] (${url})`;
      var keywords = ["Developers", "blog"];
      var results = [];
      var UrlArray = [];
      for (var i = 0; i < keywords.length; i++) {
        var match = title.match(new RegExp(keywords.join("|"), "g"));
        results.push(keywords[i]);
      }
      if (match.length > 0) {
        UrlArray.push({
          Domain: url,
          Keywords: results,
          Title: output,
        });

        finalJsonData = JSON.stringify(UrlArray);
        const ws = xlsx.utils.json_to_sheet(UrlArray);
        const wb = xlsx.utils.book_new();
        xlsx.utils.book_append_sheet(wb, ws, "Responses");
        xlsx.writeFile(wb, "output.xlsx");
      }
    } else {
      console.log(
        `Error = ${error}, code = ${response && response.statusCode}`
      );
    }

    console.log(`output = ${output} \n\n`);

    if (onComplete) onComplete(output);
  });
}

jsonData.forEach(function (table) {
  var tableName = table.Domain;
  var URL = "http://" + tableName;
  fetchTitle(URL);
});

When I execute the script, I am able to get the title, but when I compare it with the keyword, it is not working as expected. Keywords are not being stored. You can see how the output looks after executing the script.

output excel screenshot

The script shows that both domains have keywords, but only blogger is stored in the spreadsheet, even then keywords aren't stored


Solution

    1. you're overwriting the file on each loop,
    2. keywords is an array, so it doesn't get saved, furthermore, keywords column will always contain all keywords, not the matching ones...

    as requests are async, you need to track them all, and write results only when all requests are finished.

    try this:

    • match case insensitive, and store only matching keywords for that site, not all (I also added "no match" for domains with no match)
    • store results outside the loop
    • move writing results into a separate function
    • add request counter and callback to track requests
    • write results when requests are done

    the code:

    var request = require("request");
    var cheerio = require("cheerio");
    const xlsx = require("xlsx");
    
    const jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
    
    var UrlArray = [];
    
    function writeResults() {
    
        const finalJsonData = JSON.stringify(UrlArray);
        const ws = xlsx.utils.json_to_sheet(UrlArray);
        const wb = xlsx.utils.book_new();
        xlsx.utils.book_append_sheet(wb, ws, "Responses");
        xlsx.writeFile(wb, "output.xlsx");
    
    }
    
    function fetchTitle(url, onComplete = null) {
    
        request(url, function (error, response, body) {
            var output = url; // default to URL
    
            if (!error && (response && response.statusCode) === 200) {
                var $ = cheerio.load(body);
                console.log(`URL = ${url}`);
    
                var title = $("head > title").text().trim();
                console.log(`Title = ${title}`);
                output = `[${title}] (${url})`;
                var keywords = ["Developers", "blog"];
                var results = [];
                for (var i = 0; i < keywords.length; i++) {
                    let match = title.match(new RegExp(keywords[i], "gi"));
                    if (match && match.length > 0) {
                        results.push(keywords[i]);
                    }
                }
    
                UrlArray.push({
                    Domain: url,
                    Keywords: results.length > 0 ? results.join(', ') : 'no match',
                    Title: output,
                });
    
            } else {
                console.log(
                    `Error = ${error}, code = ${response && response.statusCode}`
                );
            }
    
            console.log(`output = ${output} \n\n`);
    
            if (onComplete) onComplete(output);
        });
    }
    
    let counter = 0;
    
    jsonData.forEach(function (table) {
        var tableName = table.Domain;
        var URL = "http://" + tableName;
        fetchTitle(URL, ()=>{
            counter++;
            if(counter === jsonData.length) {
                console.log(`all ${counter} requests done`);
                writeResults();
            }
        });
    });