I've created a script in node
using promise
in combination with request
and cheerio
to parse the links
under Province
column from this webpage then reuse those links to scrape all the urls under Office
column from all of such pages and finally make use these links
to collect the title
from all of such target pages, as in Cairos main Post Office
in this page.
My current script most of the times gets stuck. However, sometimes it throws this error UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined
. I've checked each of the functions and found that they are all working in the right way individually.
Although the script looks a bit bigger, it is built upon a very simple logic which is make use of each links
from it's landing page until it reaches the title
of it's target page.
This is my try so far:
const request = require('request');
const cheerio = require('cheerio');
const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';
const items = [];
const nitems = [];
let getLinks = () => {
return new Promise((resolve, reject) => {
request(link, function(error, response, html) {
let $ = cheerio.load(html);
if (error) return reject(error);
try {
$('.table tbody tr').each(function() {
items.push(base_link + $(this).find("a").attr("href"));
});
resolve(items);
} catch (e) {
reject(e);
}
});
});
};
let getData = (links) => {
const promises = links
.map(nurl => new Promise((resolve, reject) => {
request(nurl, function(error, response, html) {
let $ = cheerio.load(html);
if (error) return reject(error);
try {
$('.table tbody tr').each(function() {
nitems.push(base_link + $(this).find("a").attr("href"));
});
resolve(nitems);
} catch (e) {
reject(e);
}
})
}))
return Promise.all(promises)
}
let FetchData = (links) => {
const promises = links
.map(nurl => new Promise((resolve, reject) => {
request(nurl, function(error, response, html) {
let $ = cheerio.load(html);
if (error) return reject(error);
try {
resolve($(".home-title > h2").eq(0).text());
} catch (e) {
reject(e);
}
})
}))
return Promise.all(promises)
}
getLinks().then(resultList => {
getData(resultList).then(resultSet => {
FetchData(resultSet).then(title =>{
console.log(title);
})
})
})
How can I scrape the titles from target pages making use of all the links from landing pages?
So the issue is with 2D
array. If you go through carefully over your getData
function, you're returning 2D array.
map
return an array and within that map you're resolving another array nitems
.
Here's the working code:
const base_link = 'https://www.egyptcodebase.com/en/';
// helper wrapper DRY
const getHtmls = (url) => {
return new Promise((resolve, reject) => {
request({ uri: url, method: 'GET', followAllRedirects: true } , function(error, response, html) {
if (error) reject(error);
else resolve(html);
});
})
}
let getLinks = async () => {
const link = 'https://www.egyptcodebase.com/en/p/all';
const items = [];
try {
const html = await getHtmls(link);
let $ = cheerio.load(html);
$('.table tbody tr').each(function() {
items.push(base_link + $(this).find("a").attr("href"));
});
} catch (e) {
// handling error here so execution can continue for good eggs
console.error(e.message)
}
return items;
};
let getData = async (links) => {
const out = [];
try {
const promises = links.map(nurl => getHtmls(nurl));
const htmls = await Promise.all(promises);
htmls.forEach(html => {
let $ = cheerio.load(html);
$('.table tbody tr').each(function() {
out.push(base_link + $(this).find("a").attr("href"));
});
})
} catch (e) {
// handling error here so execution can continue for good eggs
console.error(e.message)
}
return out;
}
let FetchData = async (links) => {
const out = [];
try {
const promises = links.map(nurl => getHtmls(nurl));
const htmls = await Promise.all(promises)
htmls.forEach(html => {
try {
let $ = cheerio.load(html);
out.push($(".home-title > h2").eq(0).text());
} catch (e){
// handling error here so execution can continue for good eggs
console.error(e.message)
}
})
} catch (e) {
// handling error here so execution can continue for good eggs
console.error(e.message)
}
return out;
}
getLinks().then(resultList => {
getData(resultList).then(resultSet => {
FetchData(resultSet).then(title =>{
console.log(title);
})
})
})
Note: Instead of writing your own Promise
wrapper, you could use request-promise
package