Search code examples
node.js web-scraping promise request cheerio

Unable to make use of links to fetch different titles


I've created a script in Node.js using promises in combination with request and cheerio. It parses the links under the Province column of this webpage, then reuses those links to scrape all the URLs under the Office column of each of those pages, and finally makes use of these links to collect the title from each of the target pages, such as Cairos main Post Office in this page.

Most of the time my current script gets stuck. However, it sometimes throws this error: UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined. I've checked each of the functions and found that they all work correctly individually.

Although the script looks a bit long, it is built upon very simple logic: follow each link from its landing page until the title of its target page is reached.

This is my try so far:

const request = require('request');
const cheerio = require('cheerio');

// Landing page requested by getLinks.
const link = 'https://www.egyptcodebase.com/en/p/all';
// Prefix prepended to every href collected from the tables.
const base_link = 'https://www.egyptcodebase.com/en/';

// Module-level accumulators: getLinks pushes into `items`, while every request
// issued by getData pushes into the single shared `nitems` array.
const items = [];
const nitems = [];

// Requests the landing page (`link`) and, for each `.table tbody tr` row, appends
// base_link + the href read from the row's anchor to the module-level `items` array.
// Resolves with `items`; rejects with the request error or with anything thrown
// while the rows are being scanned.
let getLinks = () => {
    return new Promise((resolve, reject) => {
        request(link, function(error, response, html) {
            // Note the order: the body is handed to cheerio before `error` is inspected.
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                $('.table tbody tr').each(function() {
                    items.push(base_link + $(this).find("a").attr("href"));
                });
                resolve(items);
            } catch (e) {
                reject(e);
            }
        });
    });
};

// Requests every URL in `links` and, for each `.table tbody tr` row of each response,
// appends base_link + the row's anchor href to the module-level `nitems` array.
// Returns Promise.all over one promise per link.
let getData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                // The body is handed to cheerio before `error` is inspected.
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    $('.table tbody tr').each(function() {
                        nitems.push(base_link + $(this).find("a").attr("href"));
                    });
                    // Every promise resolves with the same shared `nitems` array, so the
                    // Promise.all below fulfils with one reference to `nitems` per link
                    // (an array of arrays) rather than a flat list of URLs.
                    resolve(nitems);
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

// Requests every entry of `links` and resolves each promise with the text of the
// first `.home-title > h2` match (`.eq(0)`). Returns Promise.all over those promises,
// so the result rejects as soon as any single request rejects.
let FetchData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                // The body is handed to cheerio before `error` is inspected.
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    resolve($(".home-title > h2").eq(0).text());
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

// Runs the three stages in sequence (getLinks -> getData -> FetchData) and logs the
// value FetchData fulfils with. The inner promises are not returned from their
// callbacks and no rejection handler is attached anywhere in the chain.
getLinks().then(resultList => {
    getData(resultList).then(resultSet => {
        FetchData(resultSet).then(title =>{
            console.log(title);
        })
    })
})

How can I scrape the titles from target pages making use of all the links from landing pages?


Solution

  • The issue is a 2D array. If you look carefully at your getData function, you'll see that it returns a 2D array.

    map returns an array, and inside that map every promise resolves with another array, nitems.

    Here's the working code:

    // Prefix prepended to every href scraped from the tables.
    const base_link = 'https://www.egyptcodebase.com/en/';
    
    /**
     * Promise adapter around the callback-style `request` call, so each scraper
     * stage can simply await a page body instead of repeating the wrapper.
     *
     * @param {string} url - Address to fetch with a GET request (all redirects followed).
     * @returns {Promise<string>} Fulfils with the response body handed to the
     *   callback; rejects with the error reported by `request`.
     */
    const getHtmls = (url) => new Promise((resolve, reject) => {
      const options = { uri: url, method: 'GET', followAllRedirects: true };
      request(options, (error, response, html) => {
        if (error) {
          reject(error);
          return;
        }
        resolve(html);
      });
    });
    
    /**
     * Collects the links found in the table of the landing page.
     *
     * For every `.table tbody tr` row, the href of the row's anchor is prefixed
     * with `base_link` and collected. Rows that carry no href are skipped.
     *
     * @returns {Promise<string[]>} The collected URLs. Never rejects: a failed
     *   fetch or parse is logged and whatever was collected so far is returned.
     */
    let getLinks = async () => {
      const link = 'https://www.egyptcodebase.com/en/p/all';
      const items = [];
      try {
        const html = await getHtmls(link);
        const $ = cheerio.load(html);
        $('.table tbody tr').each(function() {
          const href = $(this).find("a").attr("href");
          // A row without an anchor yields undefined; concatenating it would
          // create a bogus ".../undefined" URL for the next stage to request.
          if (href !== undefined) {
            items.push(base_link + href);
          }
        });
      } catch (e) {
        // handling error here so execution can continue for good eggs
        console.error(e.message)
      }
      return items;
    };
    
    /**
     * Fetches every page in `links` and collects the table links found on them.
     *
     * For every `.table tbody tr` row of every page that loaded, the href of the
     * row's anchor is prefixed with `base_link` and appended to one flat array.
     * Rows that carry no href are skipped.
     *
     * @param {string[]} links - URLs of the pages to scan.
     * @returns {Promise<string[]>} Flat list of collected URLs, in the order of
     *   `links`. Never rejects: failures are logged and the affected page is skipped.
     */
    let getData = async (links) => {
      const out = [];
      try {
        // Each fetch absorbs its own failure (resolving to null) so that a single
        // bad link cannot reject the whole batch and discard the good pages.
        const promises = links.map(nurl =>
          getHtmls(nurl).catch(e => {
            console.error(e.message);
            return null;
          })
        );

        const htmls = await Promise.all(promises);
        htmls.forEach(html => {
          if (html === null) return; // this request failed and was already logged
          try {
            const $ = cheerio.load(html);
            $('.table tbody tr').each(function() {
              const href = $(this).find("a").attr("href");
              // Skip anchor-less rows instead of emitting ".../undefined".
              if (href !== undefined) {
                out.push(base_link + href);
              }
            });
          } catch (e) {
            // A page that fails to parse must not hide the others.
            console.error(e.message);
          }
        })
      } catch (e) {
        // handling error here so execution can continue for good eggs
        console.error(e.message)
      }
      return out;
    }
    
    /**
     * Fetches every page in `links` and extracts its title.
     *
     * The title is the text of the first `.home-title > h2` match on the page.
     *
     * @param {string[]} links - URLs of the target pages.
     * @returns {Promise<string[]>} Titles of the pages that could be fetched and
     *   parsed, in the order of `links`. Never rejects: failures are logged and
     *   the affected page is skipped.
     */
    let FetchData = async (links) => {
      const out = [];
      try {
        // Each fetch absorbs its own failure (resolving to null) so that a single
        // bad link cannot reject the whole batch and discard the good pages.
        const promises = links.map(nurl =>
          getHtmls(nurl).catch(e => {
            console.error(e.message);
            return null;
          })
        );
        const htmls = await Promise.all(promises)
        htmls.forEach(html => {
          if (html === null) return; // this request failed and was already logged
          try {
            const $ = cheerio.load(html);
            out.push($(".home-title > h2").eq(0).text());
          } catch (e){
            // handling error here so execution can continue for good eggs
            console.error(e.message)
          }
        })
      } catch (e) {
        // handling error here so execution can continue for good eggs
        console.error(e.message)
      }
      return out;
    }
    
    // Run the three stages in sequence and print the collected titles.
    // Each stage's promise is returned into the chain (instead of being nested
    // and left floating) so that a single trailing handler sees any rejection.
    getLinks()
      .then(resultList => getData(resultList))
      .then(resultSet => FetchData(resultSet))
      .then(title => {
        console.log(title);
      })
      .catch(e => {
        // Without this handler a failure would surface only as an
        // UnhandledPromiseRejectionWarning.
        console.error(e.message);
      });
    

    Note: Instead of writing your own Promise wrapper, you could use request-promise package