This script is to get the title of the webpage where the URL of the website will be passed from an excel file, check to see if the title contains the keyword, and then store that domain in the new excel file.
There is no issue with the partial code, but the title comparison does not work as expected. Does anyone have an idea how to fix it?
here is my code
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
function fetchTitle(url, onComplete = null) {
request(url, function (error, response, body) {
var output = url; // default to URL
if (!error && (response && response.statusCode) === 200) {
var $ = cheerio.load(body);
console.log(`URL = ${url}`);
var title = $("head > title").text().trim();
console.log(`Title = ${title}`);
output = `[${title}] (${url})`;
var keywords = ["Developers", "blog"];
var results = [];
var UrlArray = [];
for (var i = 0; i < keywords.length; i++) {
var match = title.match(new RegExp(keywords.join("|"), "g"));
results.push(keywords[i]);
}
if (match.length > 0) {
UrlArray.push({
Domain: url,
Keywords: results,
Title: output,
});
finalJsonData = JSON.stringify(UrlArray);
const ws = xlsx.utils.json_to_sheet(UrlArray);
const wb = xlsx.utils.book_new();
xlsx.utils.book_append_sheet(wb, ws, "Responses");
xlsx.writeFile(wb, "output.xlsx");
}
} else {
console.log(
`Error = ${error}, code = ${response && response.statusCode}`
);
}
console.log(`output = ${output} \n\n`);
if (onComplete) onComplete(output);
});
}
jsonData.forEach(function (table) {
var tableName = table.Domain;
var URL = "http://" + tableName;
fetchTitle(URL);
});
When I execute the script, I am able to get the title, but when I compare it with the keyword, it is not working as expected. Keywords are not being stored. You can see how the output looks after executing the script.
The script shows that both domains have keywords, but only blogger is stored in the spreadsheet, even then keywords aren't stored
keywords
is an array, so it doesn't get saved, furthermore, keywords column will always contain all keywords, not the matching ones...as requests are async, you need to track them all, and write results only when all requests are finished.
try this:
the code:
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
const jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
var UrlArray = [];
function writeResults() {
const finalJsonData = JSON.stringify(UrlArray);
const ws = xlsx.utils.json_to_sheet(UrlArray);
const wb = xlsx.utils.book_new();
xlsx.utils.book_append_sheet(wb, ws, "Responses");
xlsx.writeFile(wb, "output.xlsx");
}
function fetchTitle(url, onComplete = null) {
request(url, function (error, response, body) {
var output = url; // default to URL
if (!error && (response && response.statusCode) === 200) {
var $ = cheerio.load(body);
console.log(`URL = ${url}`);
var title = $("head > title").text().trim();
console.log(`Title = ${title}`);
output = `[${title}] (${url})`;
var keywords = ["Developers", "blog"];
var results = [];
for (var i = 0; i < keywords.length; i++) {
let match = title.match(new RegExp(keywords[i], "gi"));
if (match && match.length > 0) {
results.push(keywords[i]);
}
}
UrlArray.push({
Domain: url,
Keywords: results.length > 0 ? results.join(', ') : 'no match',
Title: output,
});
} else {
console.log(
`Error = ${error}, code = ${response && response.statusCode}`
);
}
console.log(`output = ${output} \n\n`);
if (onComplete) onComplete(output);
});
}
let counter = 0;
jsonData.forEach(function (table) {
var tableName = table.Domain;
var URL = "http://" + tableName;
fetchTitle(URL, ()=>{
counter++;
if(counter === jsonData.length) {
console.log(`all ${counter} requests done`);
writeResults();
}
});
});