Search code examples
javascript node.js express axios cheerio

Scrape multiple websites using NodeJS, Express, Cheerio and Axios


I would like to scrape multiple websites using NodeJS, Express, Cheerio and Axios. I can currently scrape one website and display the information in the HTML page. But when I try to scrape multiple websites looking for the same element, it doesn't go through the whole forEach (it stops after one cycle). Note the loop that doesn't work correctly: urls.forEach(url => {

The two most important files are shown below. index.js:

// Port number handed to app.listen() further down in this file.
const PORT = 8000
// HTTP client used to download each page.
const axios = require('axios')
// HTML parser used to query the downloaded markup.
const cheerio = require('cheerio')
const express = require('express')
// The Express application that all routes below are registered on.
const app = express()
const cors = require('cors')
// Register the cors middleware for every route of this app.
app.use(cors())

// Pages that the /results route downloads and inspects.
const urls = [
  'https://www.google.nl',
  'https://www.google.de',
]
// const url = 'https://www.heineken.com/nl/nl/'

// Root route: replies with a fixed JSON string.
app.get('/', (req, res) => {
  res.json('Robin')
})

// Registers a GET /results handler once per entry in `urls`.
// NOTE(review): every iteration registers the SAME path, so there end up being
// as many '/results' handlers as there are URLs. Express routes a request to
// the first matching handler, so only the first URL is ever scraped -- this is
// the "stops after 1 cycle" behaviour described in the question.
urls.forEach(url => {
  app.get('/results', (req, res) => {
    // Download the page for this URL.
    axios(url)
      .then(response => {
        const html = response.data
        const $ = cheerio.load(html)
        // Accumulates one { link } object per matching <script> node.
        const articles = []

        // Visit every <script> node and keep its `namespace` value when that
        // value contains 'w3.org'.
        $('script', html).each(function(){
          // `$(this).get()[0]` unwraps the current node from its cheerio wrapper.
          const link = $(this).get()[0].namespace
          if (link !== undefined) {
            if (link.indexOf('w3.org') > -1) {
             articles.push({
               link
             })
            }
          }
        })
        // Reply with the links collected for this single URL.
        res.json(articles)
        // NOTE(review): the catch below only logs; no response is sent from
        // this branch when the download or parsing fails.
      }).catch(err => console.log(err))
 })
})

// Start the HTTP server. The message must be a template literal (backticks):
// inside ordinary quotes `${PORT}` is printed literally instead of interpolated.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))

App.js:

// Container element that receives one heading per scraped link.
const root = document.querySelector('#root')

// Fetch the scraped articles from the local API and render each link as an <h3>.
fetch('http://localhost:8000/results')
  .then(response => {
    // Surface HTTP error statuses explicitly instead of failing later while
    // trying to parse an error page as JSON.
    if (!response.ok) {
      throw new Error(`Request failed with status ${response.status}`)
    }
    return response.json()
  })
  .then(data => {
    console.log(data)
    data.forEach(article => {
      // The link text originates from scraped third-party pages, so it is
      // inserted as plain text (never parsed as HTML) to prevent markup injection.
      const title = document.createElement('h3')
      title.textContent = String(article.link)
      root.appendChild(title)
    })
  })
  // Network failures, bad statuses and invalid JSON all end up here.
  .catch(err => console.error(err))

Solution

  • You're registering multiple route handlers for the same route. Express will only route requests to the first one. Move your URL loop inside app.get("/results", ...)...

    // Single GET /results handler: downloads every entry in `urls` in parallel
    // and responds with one flat JSON array of { link } objects. Any failure is
    // forwarded to Express's error handling via next(err).
    app.get("/results", async (req, res, next) => {
      try {
        // One array of articles per URL, in the same order as `urls`.
        const articlesPerUrl = await Promise.all(
          urls.map(async (url) => {
            const { data } = await axios(url);
            const $ = cheerio.load(data);
            const articles = [];

            // `$` already wraps the document parsed from `data`, so no context
            // argument is needed (there is no `html` variable in this scope;
            // referencing it would throw a ReferenceError).
            $("script").each(function () {
              const link = $(this).get()[0].namespace;
              // Keep only string namespaces that mention w3.org.
              if (typeof link === "string" && link.includes("w3.org")) {
                articles.push({ link });
              }
            });
            return articles;
          })
        );

        res.json(articlesPerUrl.flat()); // un-nest each array of articles
      } catch (err) {
        console.error(err);
        next(err); // make sure Express responds with an error
      }
    });